org.apache.kylin.tool.garbage.StorageCleaner
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.kylin.tool.garbage;

import static org.apache.kylin.common.util.HadoopUtil.FLAT_TABLE_STORAGE_ROOT;
import static org.apache.kylin.common.util.HadoopUtil.GLOBAL_DICT_STORAGE_ROOT;
import static org.apache.kylin.common.util.HadoopUtil.JOB_TMP_ROOT;
import static org.apache.kylin.common.util.HadoopUtil.PARQUET_STORAGE_ROOT;
import static org.apache.kylin.common.util.HadoopUtil.SNAPSHOT_STORAGE_ROOT;
import static org.apache.kylin.common.util.HadoopUtil.TABLE_EXD_STORAGE_ROOT;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import org.apache.commons.collections.CollectionUtils;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.kylin.common.KapConfig;
import org.apache.kylin.common.KylinConfig;
import org.apache.kylin.common.persistence.RawResource;
import org.apache.kylin.common.persistence.ResourceStore;
import org.apache.kylin.common.persistence.RootPersistentEntity;
import org.apache.kylin.common.persistence.TrashRecord;
import org.apache.kylin.common.persistence.transaction.UnitOfWork;
import org.apache.kylin.common.util.CliCommandExecutor;
import org.apache.kylin.common.util.CliCommandExecutor.CliCmdExecResult;
import org.apache.kylin.common.util.HadoopUtil;
import org.apache.kylin.common.util.JsonUtil;
import org.apache.kylin.common.util.Pair;
import org.apache.kylin.common.util.ShellException;
import org.apache.kylin.job.execution.ExecutableState;
import org.apache.kylin.job.execution.NExecutableManager;
import org.apache.kylin.metadata.cube.model.LayoutPartition;
import org.apache.kylin.metadata.cube.model.NDataLayout;
import org.apache.kylin.metadata.cube.model.NDataSegDetails;
import org.apache.kylin.metadata.cube.model.NDataSegment;
import org.apache.kylin.metadata.cube.model.NDataflow;
import org.apache.kylin.metadata.cube.model.NDataflowManager;
import org.apache.kylin.metadata.model.NTableMetadataManager;
import org.apache.kylin.metadata.project.EnhancedUnitOfWork;
import org.apache.kylin.metadata.project.NProjectManager;
import org.apache.kylin.metadata.project.ProjectInstance;
import org.apache.kylin.tool.util.ProjectTemporaryTableCleanerHelper;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;

import io.kyligence.kap.guava20.shaded.common.io.ByteSource;
import io.kyligence.kap.guava20.shaded.common.util.concurrent.RateLimiter;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.NonNull;
import lombok.RequiredArgsConstructor;
import lombok.ToString;
import lombok.val;
import lombok.extern.slf4j.Slf4j;

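/**
 * Scans the working directories of all (or the given) projects, compares the files on storage with
 * the live metadata, and reports or deletes the orphaned files.
 *
 * A minimal usage sketch (assumes a properly initialized KylinConfig environment; the flag and the
 * project list map to the constructors defined below, and "a_project" is a hypothetical name):
 *
 *   new StorageCleaner(false).execute();                                   // dry run: report garbage only
 *   new StorageCleaner(true, Lists.newArrayList("a_project")).execute();   // clean up a single project
 */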
@Slf4j
public class StorageCleaner {
    public static final String ANSI_RED = "\u001B[31m";
    public static final String ANSI_GREEN = "\u001B[32m";
    public static final String ANSI_YELLOW = "\u001B[33m";
    public static final String ANSI_BLUE = "\u001B[34m";
    public static final String ANSI_RESET = "\u001B[0m";

    private final boolean cleanup;
    private final boolean timeMachineEnabled;
    private final Collection<String> projectNames;
    private final KylinConfig kylinConfig;

    // Rate limiter for file system requests, mainly for S3; see https://olapio.atlassian.net/browse/AL-3154
    private static final RateLimiter rateLimiter = RateLimiter.create(Integer.MAX_VALUE);
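    // The limiter is shared by every FileSystemDecorator call below; its rate (and the retry count)
    // can be tuned through the (cleanup, projects, requestFSRate, tRetryTimes) constructor.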

    @Getter
    private final Map<String, String> trashRecord;
    private final ResourceStore resourceStore;

    public StorageCleaner() throws Exception {
        this(true);
    }

    public StorageCleaner(boolean cleanup) throws Exception {
        this(cleanup, Collections.emptyList());
    }

    public StorageCleaner(boolean cleanup, Collection<String> projects) throws Exception {
        this.cleanup = cleanup;
        this.projectNames = projects;
        this.kylinConfig = KylinConfig.getInstanceFromEnv();
        this.timeMachineEnabled = kylinConfig.getTimeMachineEnabled();
        this.resourceStore = ResourceStore.getKylinMetaStore(KylinConfig.getInstanceFromEnv());
        val trashRecordResource = resourceStore.getResource(ResourceStore.METASTORE_TRASH_RECORD);
        this.trashRecord = trashRecordResource == null ? Maps.newHashMap()
                : JsonUtil.readValue(trashRecordResource.getByteSource().read(), TrashRecord.class).getTrashRecord();
    }

    public StorageCleaner(boolean cleanup, Collection<String> projects, double requestFSRate, int tRetryTimes)
            throws Exception {
        this(cleanup, projects);
        if (requestFSRate > 0.0) {
            rateLimiter.setRate(requestFSRate);
        }
        if (tRetryTimes > 0) {
            FileSystemDecorator.retryTimes = tRetryTimes;
        }
    }

    @Getter
    private Set<StorageItem> outdatedItems = Sets.newHashSet();

    private Set<StorageItem> allFileSystems = Sets.newHashSet();

    public void execute() throws Exception {
        long start = System.currentTimeMillis();
        val config = KylinConfig.getInstanceFromEnv();
        long startTime = System.currentTimeMillis();

        val projects = NProjectManager.getInstance(config).listAllProjects().stream()
                .filter(projectInstance -> projectNames.isEmpty() || projectNames.contains(projectInstance.getName()))
                .collect(Collectors.toList());

        projects.stream().map(project -> NDataflowManager.getInstance(config, project.getName()).listAllDataflows())
                .flatMap(Collection::stream).map(dataflow -> KapConfig.wrap(dataflow.getConfig()))
                .map(KapConfig::getMetadataWorkingDirectory).forEach(hdfsWorkingDir -> {
                    val fs = HadoopUtil.getWorkingFileSystem();
                    allFileSystems.add(new StorageItem(FileSystemDecorator.getInstance(fs), hdfsWorkingDir));
                });
        allFileSystems.add(new StorageItem(FileSystemDecorator.getInstance(HadoopUtil.getWorkingFileSystem()),
                config.getHdfsWorkingDirectory()));
        // Check whether independent storage of flat tables is enabled under read/write separation.
        // For build jobs it is a project-level parameter (the project-level setting takes priority),
        // but for cleaning up storage garbage WRITING_CLUSTER_WORKING_DIR is a system-level parameter.
        if (kylinConfig.isBuildFilesSeparationEnabled()) {
            allFileSystems
                    .add(new StorageItem(FileSystemDecorator.getInstance(HadoopUtil.getWritingClusterFileSystem()),
                            config.getWritingClusterWorkingDir("")));
        }
        log.info("all file systems are {}", allFileSystems);
        for (StorageItem allFileSystem : allFileSystems) {
            log.debug("start to collect HDFS from {}", allFileSystem.getPath());
            collectFromHDFS(allFileSystem);
            log.debug("folder {} is collected,detailed -> {}", allFileSystem.getPath(), allFileSystems);
        }
        UnitOfWork.doInTransactionWithRetry(() -> {
            collectDeletedProject();
            for (ProjectInstance project : projects) {
                collect(project.getName());
            }
            return null;
        }, UnitOfWork.GLOBAL_UNIT);

        long configSurvivalTimeThreshold = timeMachineEnabled ? kylinConfig.getStorageResourceSurvivalTimeThreshold()
                : config.getCuboidLayoutSurvivalTimeThreshold();
        long protectionTime = startTime - configSurvivalTimeThreshold;
        for (StorageItem item : allFileSystems) {
            for (FileTreeNode node : item.getAllNodes()) {
                val path = new Path(item.getPath(), node.getRelativePath());
                if (timeMachineEnabled && trashRecord.get(path.toString()) == null) {
                    trashRecord.put(path.toString(), String.valueOf(startTime));
                    continue;
                }
                try {
                    log.debug("start to add item {}", path);
                    addItem(item.getFileSystemDecorator(), path, protectionTime);
                } catch (FileNotFoundException e) {
                    log.warn("{} not found", path);
                }
            }
        }
        boolean allSuccess = cleanup();
        printConsole(allSuccess, System.currentTimeMillis() - start);
    }

    public void printConsole(boolean success, long duration) {
        System.out.println(ANSI_BLUE + "Kylin 5.0 garbage report: (cleanup=" + cleanup + ")" + ANSI_RESET);
        for (StorageItem item : outdatedItems) {
            System.out.println("  Storage File: " + item.getPath());
        }
        String jobName = "Storage GC cleanup job ";
        if (!cleanup) {
            System.out.println(ANSI_BLUE + "Dry run mode, no data is deleted." + ANSI_RESET);
            jobName = "Storage GC check job ";
        }
        if (!success) {
            System.out.println(ANSI_RED + jobName + "FAILED." + ANSI_RESET);
            System.out.println(ANSI_RED + jobName + "finished in " + duration + " ms." + ANSI_RESET);
        } else {
            System.out.println(ANSI_GREEN + jobName + "SUCCEED." + ANSI_RESET);
            System.out.println(ANSI_GREEN + jobName + "finished in " + duration + " ms." + ANSI_RESET);
        }

    }

    public void collectDeletedProject() {
        val config = KylinConfig.getInstanceFromEnv();
        val projects = NProjectManager.getInstance(config).listAllProjects().stream().map(ProjectInstance::getName)
                .collect(Collectors.toSet());
        for (StorageItem item : allFileSystems) {
            item.getProjectNodes().removeIf(node -> projects.contains(node.getName()));
            log.info("remaining project nodes: {}", item.getProjectNodes().size());
        }
    }

    public void collect(String project) {
        log.info("collect garbage for project: {}", project);
        new ProjectStorageCleaner(project).execute();
        log.info("clean temporary table for project: {}", project);
        new ProjectTemporaryTableCleaner(project).execute();
    }

    public boolean cleanup() throws Exception {
        boolean success = true;
        if (cleanup) {
            Stats stats = new Stats() {
                @Override
                public void heartBeat() {
                    double percent = 100D * (successItems.size() + errorItems.size()) / allItems.size();
                    String logInfo = String.format(Locale.ROOT, "Progress: %2.1f%%, %d resource, %d error", percent,
                            allItems.size(), errorItems.size());
                    System.out.println(logInfo);
                }
            };
            stats.onAllStart(outdatedItems);
            for (StorageItem item : outdatedItems) {
                log.debug("try to delete {}", item.getPath());
                if (Thread.currentThread().isInterrupted()) {
                    throw new InterruptedException();
                }
                try {
                    stats.onItemStart(item);
                    item.getFileSystemDecorator().delete(new Path(item.getPath()), true);
                    if (timeMachineEnabled) {
                        trashRecord.remove(item.getPath());
                    }
                    stats.onItemSuccess(item);
                } catch (IOException e) {
                    log.error("delete file " + item.getPath() + " failed", e);
                    stats.onItemError(item);
                    success = false;
                }
            }
            if (timeMachineEnabled) {
                EnhancedUnitOfWork.doInTransactionWithCheckAndRetry(() -> {
                    ResourceStore threadViewRS = ResourceStore.getKylinMetaStore(KylinConfig.getInstanceFromEnv());
                    RawResource raw = resourceStore.getResource(ResourceStore.METASTORE_TRASH_RECORD);
                    long mvcc = raw == null ? -1 : raw.getMvcc();
                    threadViewRS.checkAndPutResource(ResourceStore.METASTORE_TRASH_RECORD,
                            ByteSource.wrap(JsonUtil.writeValueAsBytes(new TrashRecord(trashRecord))), mvcc);
                    return 0;
                }, UnitOfWork.GLOBAL_UNIT, 1);
            }
        }
        return success;
    }

    private String getDataflowBaseDir(String project) {
        return project + PARQUET_STORAGE_ROOT + "/";
    }

    private String getDataflowDir(String project, String dataflowId) {
        return getDataflowBaseDir(project) + dataflowId;
    }

    private String getDfFlatTableDir(String project, String dataFlowId) {
        return project + FLAT_TABLE_STORAGE_ROOT + "/" + dataFlowId;
    }
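    // Illustration only: with the storage roots shown in the file-hierarchy comment inside StorageItem
    // below, these helpers build relative paths such as "<project>/parquet/<dataflow_id>" and
    // "<project>/flat_table/<dataflow_id>".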

    class ProjectStorageCleaner {

        private final String project;

        private final Set<String> dependentFiles = Sets.newTreeSet();

        ProjectStorageCleaner(String project) {
            this.project = project;
        }

        public void execute() {
            collectJobTmp(project);
            collectDataflow(project);
            collectTable(project);

            for (StorageItem item : allFileSystems) {
                for (List<FileTreeNode> nodes : item.getProject(project).getAllCandidates()) {
                    for (FileTreeNode node : nodes) {
                        log.debug("find candidate /{}", node.getRelativePath());
                    }
                }
            }
            for (String dependentFile : dependentFiles) {
                log.debug("remove candidate {}", dependentFile);
            }
            removeDependentFiles();
        }

        private void removeDependentFiles() {
            for (StorageItem item : allFileSystems) {
                for (List<FileTreeNode> nodes : item.getProject(project).getAllCandidates()) {
                    // protect dependent files, their children, and their parent folders by dropping them from the candidates
                    nodes.removeIf(
                            node -> dependentFiles.stream().anyMatch(df -> ("/" + node.getRelativePath()).startsWith(df)
                                    || df.startsWith("/" + node.getRelativePath())));
                }
            }
        }

        private void collectJobTmp(String project) {
            val config = KylinConfig.getInstanceFromEnv();
            val executableManager = NExecutableManager.getInstance(config, project);
            Set<String> activeJobs = executableManager.getAllExecutables().stream()
                    .map(e -> project + JOB_TMP_ROOT + "/" + e.getId()).collect(Collectors.toSet());
            for (StorageItem item : allFileSystems) {
                item.getProject(project).getJobTmps().removeIf(node -> activeJobs.contains(node.getRelativePath()));
            }
        }

        private void collectDataflow(String project) {
            val config = KylinConfig.getInstanceFromEnv();
            val dataflowManager = NDataflowManager.getInstance(KylinConfig.getInstanceFromEnv(), project);
            val activeIndexDataPath = Sets.<String> newHashSet();
            val activeBucketDataPath = Sets.<String> newHashSet();
            val activeFastBitmapIndexDataPath = Sets.<String> newHashSet();
            val activeSegmentFlatTableDataPath = Sets.<String> newHashSet();
            val dataflows = NDataflowManager.getInstance(config, project).listAllDataflows().stream()
                    .map(RootPersistentEntity::getId).collect(Collectors.toSet());
            // set activeSegmentFlatTableDataPath by iterating over segments
            dataflowManager.listAllDataflows().forEach(df -> df.getSegments().stream() //
                    .map(segment -> getSegmentFlatTableDir(project, segment))
                    .forEach(activeSegmentFlatTableDataPath::add));
            // set activeIndexDataPath and activeBucketDataPath
            dataflowManager.listAllDataflows().forEach(dataflow -> dataflow.getSegments().stream() //
                    .flatMap(segment -> segment.getLayoutsMap().values().stream()) //
                    .forEach(layout -> {
                        activeIndexDataPath.add(getDataLayoutDir(layout));
                        layout.getMultiPartition().forEach(partition -> //
                        activeBucketDataPath.add(getDataPartitionDir(layout, partition)));
                    }));
            activeIndexDataPath
                    .forEach(path -> activeFastBitmapIndexDataPath.add(path + HadoopUtil.FAST_BITMAP_SUFFIX));
            val activeSegmentPath = activeIndexDataPath.stream().map(s -> new File(s).getParent())
                    .collect(Collectors.toSet());
            for (StorageCleaner.StorageItem item : allFileSystems) {
                item.getProject(project).getDataflows().removeIf(node -> dataflows.contains(node.getName()));
                item.getProject(project).getSegments()
                        .removeIf(node -> activeSegmentPath.contains(node.getRelativePath()));
                item.getProject(project).getLayouts()
                        .removeIf(node -> activeIndexDataPath.contains(node.getRelativePath())
                                || activeFastBitmapIndexDataPath.contains(node.getRelativePath()));
                item.getProject(project).getBuckets()
                        .removeIf(node -> activeBucketDataPath.contains(node.getRelativePath()));
                item.getProject(project).getDfFlatTables().removeIf(node -> dataflows.contains(node.getName()));
                item.getProject(project).getSegmentFlatTables()
                        .removeIf(node -> activeSegmentFlatTableDataPath.contains(node.getRelativePath()));
            }
        }

        private void collectTable(String project) {
            val config = KylinConfig.getInstanceFromEnv();
            val tableManager = NTableMetadataManager.getInstance(config, project);
            val activeDictDir = Sets.<String> newHashSet();
            val activeTableExdDir = Sets.<String> newHashSet();
            val activeDictTableDir = Sets.<String> newHashSet();
            val activeSnapshotTableDir = Sets.<String> newHashSet();
            val activeSnapshotDir = Sets.<String> newHashSet();
            tableManager.listAllTables().forEach(table -> {
                Arrays.stream(table.getColumns())
                        .map(column -> getDictDir(project) + "/" + table.getIdentity() + "/" + column.getName())
                        .forEach(activeDictDir::add);
                activeTableExdDir.add(project + ResourceStore.TABLE_EXD_RESOURCE_ROOT + "/" + table.getIdentity());
                activeSnapshotTableDir.add(project + SNAPSHOT_STORAGE_ROOT + "/" + table.getIdentity());
                if (table.getLastSnapshotPath() != null) {
                    activeSnapshotDir.add(table.getLastSnapshotPath());
                }
                activeDictTableDir.add(getDictDir(project) + "/" + table.getIdentity());
            });

            for (StorageCleaner.StorageItem item : allFileSystems) {
                item.getProject(project).getGlobalDictTables()
                        .removeIf(node -> activeDictTableDir.contains(node.getRelativePath()));
                item.getProject(project).getGlobalDictColumns()
                        .removeIf(node -> activeDictDir.contains(node.getRelativePath()));
                item.getProject(project).getSnapshots()
                        .removeIf(node -> activeSnapshotDir.contains(node.getRelativePath()));
                item.getProject(project).getSnapshotTables()
                        .removeIf(node -> activeSnapshotTableDir.contains(node.getRelativePath()));
                item.getProject(project).getTableExds()
                        .removeIf(node -> activeTableExdDir.contains(node.getRelativePath()));
            }
        }
    }

    class ProjectTemporaryTableCleaner {
        private final String project;
        private CliCommandExecutor cliCommandExecutor;
        private ProjectTemporaryTableCleanerHelper tableCleanerHelper;

        ProjectTemporaryTableCleaner(String project) {
            this.project = project;
            this.cliCommandExecutor = new CliCommandExecutor();
            this.tableCleanerHelper = new ProjectTemporaryTableCleanerHelper();
        }

        public void execute() {
            List<FileTreeNode> jobTemps = allFileSystems.iterator().next().getProject(project).getJobTmps();
            doExecuteCmd(collectDropTemporaryTransactionTable(jobTemps));
        }

        private void doExecuteCmd(String cmd) {
            try {
                CliCmdExecResult executeResult = cliCommandExecutor.execute(cmd, null);
                if (executeResult.getCode() != 0) {
                    log.error("execute drop intermediate table return fail, cmd : " + cmd);
                } else {
                    log.info("execute drop intermediate table succeeded, cmd: " + cmd);
                }
            } catch (ShellException e) {
                log.error("execute drop intermediate table error, cmd : " + cmd, e);
            }
        }

        public String collectDropTemporaryTransactionTable(List<FileTreeNode> jobTemps) {
            String result = "";
            try {
                KylinConfig config = KylinConfig.getInstanceFromEnv();
                Set<String> jobTempTables = jobTemps.stream()
                        .map(node -> tableCleanerHelper.getJobTransactionalTable(project, node.getName()))
                        .flatMap(Collection::stream).collect(Collectors.toSet());

                Set<String> discardTempTables = NExecutableManager.getInstance(config, project)
                        .getExecutablesByStatus(ExecutableState.DISCARDED).stream()
                        .map(e -> tableCleanerHelper.getJobTransactionalTable(project, e.getId()))
                        .flatMap(Collection::stream).collect(Collectors.toSet());
                jobTempTables.addAll(discardTempTables);

                if (CollectionUtils.isNotEmpty(jobTempTables) && config.isReadTransactionalTableEnabled()) {
                    result = tableCleanerHelper.getDropTmpTableCmd(project, jobTempTables);
                }
            } catch (Exception exception) {
                log.error("Failed to delete temporary tables.", exception);
            }
            log.info("collectDropTemporaryTransactionTable end.");
            return result;
        }
    }

    private void addItem(FileSystemDecorator fs, Path itemPath, long protectionTime) throws IOException {
        val status = fs.getFileStatus(itemPath);
        if (status.getPath().getName().startsWith(".")) {
            return;
        }
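        // A path becomes garbage only after it outlives the protection window: with the time machine,
        // the clock starts when the path was first recorded in trashRecord (see execute()); otherwise
        // the file's own modification time is used.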
        if (timeMachineEnabled && Long.parseLong(trashRecord.get(itemPath.toString())) > protectionTime) {
            return;
        }
        if (!timeMachineEnabled && status.getModificationTime() > protectionTime) {
            return;
        }

        outdatedItems.add(new StorageCleaner.StorageItem(fs, status.getPath().toString()));
    }

    private String getDictDir(String project) {
        return project + GLOBAL_DICT_STORAGE_ROOT;
    }

    private String getSegmentFlatTableDir(String project, NDataSegment segment) {
        return getDfFlatTableDir(project, segment.getDataflow().getId()) + "/" + segment.getId();
    }

    private String getDataLayoutDir(NDataLayout dataLayout) {
        NDataSegDetails segDetails = dataLayout.getSegDetails();
        return getDataflowDir(segDetails.getProject(), segDetails.getDataSegment().getDataflow().getId()) + "/"
                + segDetails.getUuid() + "/" + dataLayout.getLayoutId();
    }

    private String getDataPartitionDir(NDataLayout dataLayout, LayoutPartition dataPartition) {
        return getDataLayoutDir(dataLayout) + "/" + dataPartition.getBucketId();
    }

    private void collectFromHDFS(StorageItem item) throws Exception {
        val projectFolders = item.getFileSystemDecorator().listStatus(new Path(item.getPath()),
                path -> !path.getName().startsWith("_")
                        && (this.projectNames.isEmpty() || this.projectNames.contains(path.getName())));
        for (FileStatus projectFolder : projectFolders) {
            List<FileTreeNode> tableSnapshotParents = Lists.newArrayList();
            val projectNode = new ProjectFileTreeNode(projectFolder.getPath().getName());
            for (Pair<String, List<FileTreeNode>> pair : Arrays.asList(
                    Pair.newPair(JOB_TMP_ROOT.substring(1), projectNode.getJobTmps()),
                    Pair.newPair(GLOBAL_DICT_STORAGE_ROOT.substring(1), projectNode.getGlobalDictTables()),
                    Pair.newPair(PARQUET_STORAGE_ROOT.substring(1), projectNode.getDataflows()),
                    Pair.newPair(TABLE_EXD_STORAGE_ROOT.substring(1), projectNode.getTableExds()),
                    Pair.newPair(SNAPSHOT_STORAGE_ROOT.substring(1), tableSnapshotParents),
                    Pair.newPair(FLAT_TABLE_STORAGE_ROOT.substring(1), projectNode.getDfFlatTables()))) {
                val treeNode = new FileTreeNode(pair.getFirst(), projectNode);
                try {
                    log.debug("collect files from {}", pair.getFirst());
                    Stream.of(item.getFileSystemDecorator()
                            .listStatus(new Path(item.getPath(), treeNode.getRelativePath())))
                            .forEach(x -> pair.getSecond().add(new FileTreeNode(x.getPath().getName(), treeNode)));
                } catch (FileNotFoundException e) {
                    log.info("folder {} not found", new Path(item.getPath(), treeNode.getRelativePath()));
                }
            }
            item.getProjectNodes().add(projectNode);
            item.getProjects().put(projectNode.getName(), projectNode);
            for (Pair<List<FileTreeNode>, List<FileTreeNode>> pair : Arrays.asList(
                    Pair.newPair(tableSnapshotParents, projectNode.getSnapshots()), //
                    Pair.newPair(projectNode.getGlobalDictTables(), projectNode.getGlobalDictColumns()), //
                    Pair.newPair(projectNode.getDataflows(), projectNode.getSegments()), //
                    Pair.newPair(projectNode.getSegments(), projectNode.getLayouts()),
                    Pair.newPair(projectNode.getDfFlatTables(), projectNode.getSegmentFlatTables()))) {
                val slot = pair.getSecond();
                for (FileTreeNode node : pair.getFirst()) {
                    log.debug("collect from {} -> {}", node.getName(), node);
                    Stream.of(
                            item.getFileSystemDecorator().listStatus(new Path(item.getPath(), node.getRelativePath())))
                            .forEach(x -> slot.add(new FileTreeNode(x.getPath().getName(), node)));
                }
            }
            projectNode.getBuckets().addAll(collectMultiPartitions(item, projectNode.getName(), projectNode.getLayouts()));
        }

    }

    private List<FileTreeNode> collectMultiPartitions(StorageItem item, String project, List<FileTreeNode> layouts)
            throws IOException {
        NDataflowManager manager = NDataflowManager.getInstance(kylinConfig, project);
        FileSystemDecorator fileSystemDecorator = item.getFileSystemDecorator();
        String itemPath = item.getPath();
        List<FileTreeNode> result = Lists.newArrayList();
        HashSet<String> cached = Sets.newHashSet();
        // Bucket directories may not exist; only multi-level partition models produce them.
        for (FileTreeNode node : layouts) {
            String dataflowId = node.getParent().getParent().getName(); // dataflow
            if (cached.contains(dataflowId)) {
                continue;
            }
            NDataflow dataflow = manager.getDataflow(dataflowId);
            if (Objects.nonNull(dataflow) //
                    && Objects.nonNull(dataflow.getModel()) //
                    && dataflow.getModel().isMultiPartitionModel()) {
                cached.add(dataflowId);
                result.addAll(Stream.of(fileSystemDecorator.listStatus(new Path(itemPath, node.getRelativePath())))
                        .filter(FileStatus::isDirectory) // only directories can be bucket folders; skip unexpected files
                        .map(x -> new FileTreeNode(x.getPath().getName(), node)).collect(Collectors.toList()));
            } else {
                cached.add(dataflowId);
            }
        }
        return result;
    }

    @AllArgsConstructor
    public static class FileSystemDecorator {
        @NonNull
        private FileSystem fs;

        private static int retryTimes = 3;

        interface Action<T> {
            T run() throws IOException;
        }

        private <E> E sleepAndRetry(Action<E> action) throws IOException {
            rateLimiter.acquire();

            for (int i = 0; i < retryTimes - 1; i++) {
                try {
                    return action.run();
                } catch (FileNotFoundException e) {
                    throw e;
                } catch (Exception e) {
                    log.error("Failed to use fs api!", e);
                }

                try {
                    Thread.sleep(1000);
                } catch (InterruptedException ie) {
                    log.error("Failed to sleep!", ie);
                    Thread.currentThread().interrupt();
                }
            }

            return action.run();
        }

        public static FileSystemDecorator getInstance(FileSystem fs) {
            return new FileSystemDecorator(fs);
        }

        public FileStatus[] listStatus(Path f) throws IOException {
            return sleepAndRetry(() -> fs.listStatus(f));
        }

        public FileStatus[] listStatus(Path f, PathFilter filter) throws IOException {
            return sleepAndRetry(() -> fs.listStatus(f, filter));
        }

        public FileStatus getFileStatus(Path f) throws IOException {
            return sleepAndRetry(() -> fs.getFileStatus(f));
        }

        public boolean delete(Path f, boolean recursive) throws IOException {
            return sleepAndRetry(() -> fs.delete(f, recursive));
        }
    }

    @Data
    @RequiredArgsConstructor
    @AllArgsConstructor
    public static class StorageItem {

        @NonNull
        private FileSystemDecorator fileSystemDecorator;

        @NonNull
        private String path;

        /**
         * File hierarchy is
         *
         * /working_dir
         * |--/${project_name}
         *    |--/parquet
         *    |  +--/${dataflow_id}
         *    |     +--/${segment_id}
         *    |        +--/${layout_id}
         *    |           +--/${bucket_id} if multi level partition enabled.
         *    |        +--/${layout_id}_fast_bitmap  if enabled
         *    |--/job_tmp
         *    |  +--/${job_id}
         *    |--/table_exd
         *    |  +--/${table_identity}
         *    |--/dict/global_dict
         *    |  +--/${table_identity}
         *    |     +--/${column_name}
         *    |--/table_snapshot
         *    |  +--/${table_identity}
         *    |     +--/${snapshot_version}
         *    |--/flat_table
         *    |  +--/${dataflow_id}
         *    |     +--/${segment_id}
         */
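        /*
         * Illustration with hypothetical ids: a layout of a multi-level partition model would be
         * collected as ${working_dir}/my_project/parquet/${dataflow_id}/${segment_id}/${layout_id}/${bucket_id},
         * and its segment flat table as ${working_dir}/my_project/flat_table/${dataflow_id}/${segment_id}.
         */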

        List<ProjectFileTreeNode> projectNodes = Lists.newArrayList();

        Map<String, ProjectFileTreeNode> projects = Maps.newHashMap();

        List<FileTreeNode> getAllNodes() {
            val allNodes = projects.values().stream().flatMap(p -> p.getAllCandidates().stream())
                    .flatMap(Collection::stream).collect(Collectors.toList());
            allNodes.addAll(projectNodes);
            return allNodes;
        }

        ProjectFileTreeNode getProject(String name) {
            return projects.getOrDefault(name, new ProjectFileTreeNode(name));
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) {
                return true;
            }
            if (o == null || getClass() != o.getClass()) {
                return false;
            }
            StorageItem that = (StorageItem) o;
            return Objects.equals(fileSystemDecorator.fs, that.fileSystemDecorator.fs)
                    && Objects.equals(path, that.path);
        }

        @Override
        public int hashCode() {
            return Objects.hash(fileSystemDecorator.fs, path);
        }
    }

    @Data
    @NoArgsConstructor
    @AllArgsConstructor
    @RequiredArgsConstructor
    public static class FileTreeNode {

        @NonNull
        String name;

        FileTreeNode parent;

        public String getRelativePath() {
            if (parent == null) {
                return name;
            }
            return parent.getRelativePath() + "/" + name;
        }
    }

    @Data
    @EqualsAndHashCode(callSuper = true)
    @ToString(onlyExplicitlyIncluded = true, callSuper = true)
    public static class ProjectFileTreeNode extends FileTreeNode {

        public ProjectFileTreeNode(String name) {
            super(name);
        }

        List<FileTreeNode> jobTmps = Lists.newLinkedList();

        List<FileTreeNode> tableExds = Lists.newLinkedList();

        List<FileTreeNode> globalDictTables = Lists.newLinkedList();

        List<FileTreeNode> globalDictColumns = Lists.newLinkedList();

        List<FileTreeNode> snapshotTables = Lists.newLinkedList();

        List<FileTreeNode> snapshots = Lists.newLinkedList();

        List<FileTreeNode> dataflows = Lists.newLinkedList();

        List<FileTreeNode> segments = Lists.newLinkedList();

        List<FileTreeNode> layouts = Lists.newLinkedList();

        List<FileTreeNode> buckets = Lists.newLinkedList();

        List<FileTreeNode> dfFlatTables = Lists.newArrayList();

        List<FileTreeNode> segmentFlatTables = Lists.newArrayList();

        Collection<List<FileTreeNode>> getAllCandidates() {
            return Arrays.asList(jobTmps, tableExds, globalDictTables, globalDictColumns, snapshotTables, snapshots,
                    dataflows, segments, layouts, buckets, dfFlatTables, segmentFlatTables);
        }

    }

    public static class Stats {

        public final Set<StorageItem> allItems = Collections.synchronizedSet(new HashSet<>());
        public final Set<StorageItem> startItem = Collections.synchronizedSet(new HashSet<>());
        public final Set<StorageItem> successItems = Collections.synchronizedSet(new HashSet<>());
        public final Set<StorageItem> errorItems = Collections.synchronizedSet(new HashSet<>());

        private void reset() {
            allItems.clear();
            startItem.clear();
            successItems.clear();
            errorItems.clear();
        }

        void onAllStart(Set<StorageItem> outDatedItems) {
            // retry enters here too, reset everything first
            reset();

            log.debug("{} items to cleanup", outDatedItems.size());
            allItems.addAll(outDatedItems);
        }

        void onItemStart(StorageItem item) {
            heartBeat();
            startItem.add(item);
        }

        void onItemError(StorageItem item) {
            errorItems.add(item);
        }

        void onItemSuccess(StorageItem item) {
            successItems.add(item);
        }

        public void onRetry() {
            // for progress printing
        }

        public void heartBeat() {
            // for progress printing
        }

        public boolean hasError() {
            return !errorItems.isEmpty();
        }
    }
}



