org.apache.kylin.rest.job.MetadataCleanupJob

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *  
 *     http://www.apache.org/licenses/LICENSE-2.0
 *  
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.kylin.rest.job;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.NavigableSet;
import java.util.Set;
import java.util.TreeSet;

import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.kylin.common.KylinConfig;
import org.apache.kylin.common.persistence.ResourceStore;
import org.apache.kylin.common.util.HadoopUtil;
import org.apache.kylin.cube.CubeInstance;
import org.apache.kylin.cube.CubeManager;
import org.apache.kylin.cube.CubeSegment;
import org.apache.kylin.dict.DictionaryInfo;
import org.apache.kylin.dict.DictionaryInfoSerializer;
import org.apache.kylin.job.dao.ExecutableDao;
import org.apache.kylin.job.dao.ExecutableOutputPO;
import org.apache.kylin.job.dao.ExecutablePO;
import org.apache.kylin.job.exception.PersistentException;
import org.apache.kylin.job.execution.ExecutableState;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.Maps;
import com.google.common.collect.Sets;

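/**
 * Housekeeping job that scans the Kylin resource store and the HDFS working directory for
 * metadata no longer referenced by any cube (table snapshots, dictionaries, cube statistics,
 * global/segment dictionaries) plus the outputs of old, completed jobs, and either reports
 * them (dry run) or deletes them. Resources modified within the last 12 hours are never touched.
 */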
public class MetadataCleanupJob {

    private static final Logger logger = LoggerFactory.getLogger(MetadataCleanupJob.class);

    private static final long NEW_RESOURCE_THRESHOLD_MS = 12 * 3600 * 1000L; // 12 hours

    // ============================================================================

    final KylinConfig config;

    private Map<String, Long> garbageResources = Maps.newHashMap();
    private ResourceStore store;

    public MetadataCleanupJob() {
        this(KylinConfig.getInstanceFromEnv());
    }

    public MetadataCleanupJob(KylinConfig config) {
        this.config = config;
        this.store = ResourceStore.getStore(config);
    }

    public Map<String, Long> getGarbageResources() {
        return garbageResources;
    }

    // entry point
    public Map<String, Long> cleanup(boolean delete, int jobOutdatedDays) throws Exception {
        CubeManager cubeManager = CubeManager.getInstance(config);
        long newResourceTimeCut = System.currentTimeMillis() - NEW_RESOURCE_THRESHOLD_MS;
        FileSystem fs = HadoopUtil.getWorkingFileSystem(HadoopUtil.getCurrentConfiguration());

        Map<String, Long> toDeleteCandidates = Maps.newHashMap();

        // two-level resources: snapshot tables (incl. ext snapshots) and cube statistics
        for (String resourceRoot : new String[] { ResourceStore.SNAPSHOT_RESOURCE_ROOT,
                ResourceStore.CUBE_STATISTICS_ROOT, ResourceStore.EXT_SNAPSHOT_RESOURCE_ROOT }) {
            for (String dir : noNull(store.listResources(resourceRoot))) {
                for (String res : noNull(store.listResources(dir))) {
                    long timestamp = getTimestamp(res);
                    if (timestamp < newResourceTimeCut)
                        toDeleteCandidates.put(res, timestamp);
                }
            }
        }

        // find all of the global and segment dictionaries on HDFS
        try {
            FileStatus[] fStatus = new FileStatus[0];
            fStatus = ArrayUtils.addAll(fStatus, fs.listStatus(new Path(
                    KylinConfig.getInstanceFromEnv().getHdfsWorkingDirectory() + "resources/GlobalDict/dict")));
            fStatus = ArrayUtils.addAll(fStatus, fs.listStatus(new Path(
                    KylinConfig.getInstanceFromEnv().getHdfsWorkingDirectory() + "resources/SegmentDict/dict")));
            for (FileStatus status : fStatus) {
                String path = status.getPath().toString();
                FileStatus[] globalDicts = fs.listStatus(new Path(path));
                for (FileStatus globalDict : globalDicts) {
                    String globalDictPath = globalDict.getPath().toString();
                    long timestamp = getTimestamp(globalDict);
                    if (timestamp < newResourceTimeCut)
                        toDeleteCandidates.put(globalDictPath, timestamp);
                }
            }
        } catch (FileNotFoundException e) {
            logger.info("Working Directory does not exist on HDFS. ");
        }

        // three level resources, only dictionaries
        for (String resourceRoot : new String[] { ResourceStore.DICT_RESOURCE_ROOT }) {
            for (String dir : noNull(store.listResources(resourceRoot))) {
                for (String dir2 : noNull(store.listResources(dir))) {
                    for (String res : noNull(store.listResources(dir2))) {
                        long timestamp = getTimestamp(res);
                        if (timestamp < newResourceTimeCut)
                            toDeleteCandidates.put(res, timestamp);
                    }
                }
            }
        }

        // exclude resources in use
        Set<String> activeResources = Sets.newHashSet();
        for (CubeInstance cube : cubeManager.reloadAndListAllCubes()) {
            activeResources.addAll(cube.getSnapshots().values());
            for (CubeSegment segment : cube.getSegments()) {
                activeResources.addAll(segment.getSnapshotPaths());
                activeResources.addAll(segment.getDictionaryPaths());
                activeResources.add(segment.getStatisticsResourcePath());
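                // AppendTrieDictionary data lives on HDFS under GlobalDict/SegmentDict rather than
                // in the resource store, so its base directory must be whitelisted as active as well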
                for (String dictPath : segment.getDictionaryPaths()) {
                    DictionaryInfo dictInfo = store.getResource(dictPath, DictionaryInfoSerializer.FULL_SERIALIZER);
                    if ("org.apache.kylin.dict.AppendTrieDictionary"
                            .equals(dictInfo != null ? dictInfo.getDictionaryClass() : null)) {
                        String dictObj = dictInfo.getDictionaryObject().toString();
                        String basedir = dictObj.substring(dictObj.indexOf("(") + 1, dictObj.indexOf(")") - 1);
                        if (basedir.startsWith(
                                KylinConfig.getInstanceFromEnv().getHdfsWorkingDirectory() + "/resources/GlobalDict")) {
                            activeResources.add(KylinConfig.getInstanceFromEnv().getHdfsWorkingDirectory()
                                    + "resources/GlobalDict" + dictInfo.getResourceDir());
                        } else if (basedir.startsWith(KylinConfig.getInstanceFromEnv().getHdfsWorkingDirectory()
                                + "/resources/SegmentDict")) {
                            activeResources.add(KylinConfig.getInstanceFromEnv().getHdfsWorkingDirectory()
                                    + "resources/SegmentDict" + dictInfo.getResourceDir());
                        }
                    }
                }
            }
        }
        toDeleteCandidates.keySet().removeAll(activeResources);

        // delete old and completed jobs
        long outdatedJobTimeCut = System.currentTimeMillis() - jobOutdatedDays * 24 * 3600 * 1000L;
        ExecutableDao executableDao = ExecutableDao.getInstance(config);
        List<ExecutablePO> allExecutable = executableDao.getJobs();
        for (ExecutablePO executable : allExecutable) {
            long lastModified = executable.getLastModified();
            if (lastModified < outdatedJobTimeCut && isJobComplete(executableDao, executable)) {
                String jobResPath = ResourceStore.EXECUTE_RESOURCE_ROOT + "/" + executable.getUuid();
                String jobOutputResPath = ResourceStore.EXECUTE_OUTPUT_RESOURCE_ROOT + "/" + executable.getUuid();
                long outputLastModified = getTimestamp(jobOutputResPath);
                toDeleteCandidates.put(jobResPath, lastModified);
                toDeleteCandidates.put(jobOutputResPath, outputLastModified);

                List<ExecutablePO> tasks = executable.getTasks();
                if (tasks != null && !tasks.isEmpty()) {
                    for (ExecutablePO task : executable.getTasks()) {
                        String taskId = task.getUuid();
                        if (StringUtils.isNotBlank(taskId)) {
                            String resPath = ResourceStore.EXECUTE_OUTPUT_RESOURCE_ROOT + "/" + task.getUuid();
                            long timestamp = getTimestamp(resPath);
                            toDeleteCandidates.put(resPath, timestamp);
                        }
                    }
                }
            }
        }

        garbageResources = cleanupConclude(delete, toDeleteCandidates);
        return garbageResources;
    }

    private boolean isJobComplete(ExecutableDao executableDao, ExecutablePO job) {
        String jobId = job.getUuid();
        boolean isComplete = false;
        try {
            ExecutableOutputPO output = executableDao.getJobOutput(jobId);
            String status = output.getStatus();
            if (StringUtils.equals(status, ExecutableState.SUCCEED.toString())
                    || StringUtils.equals(status, ExecutableState.DISCARDED.toString())) {
                isComplete = true;
            }
        } catch (PersistentException e) {
            logger.error("Get job output failed for job uuid: {}", jobId, e);
            isComplete = true; // job output broken --> will be treated as complete
        }

        return isComplete;
    }

    private Map<String, Long> cleanupConclude(boolean delete, Map<String, Long> toDeleteResources) throws IOException {
        if (toDeleteResources.isEmpty()) {
            logger.info("No metadata resource to clean up");
            return toDeleteResources;
        }

        logger.info("{} metadata resource to clean up", toDeleteResources.size());

        if (delete) {
            ResourceStore store = ResourceStore.getStore(config);
            FileSystem fs = HadoopUtil.getWorkingFileSystem(HadoopUtil.getCurrentConfiguration());
            for (String res : toDeleteResources.keySet()) {
                long timestamp = toDeleteResources.get(res);
                logger.info("Deleting metadata=[resource_path: {}, timestamp: {}]", res, timestamp);
                try {
                    if (res.startsWith(KylinConfig.getInstanceFromEnv().getHdfsWorkingDirectory())) {
                        fs.delete(new Path(res), true);
                    } else {
                        store.deleteResource(res, timestamp);
                    }
                } catch (IOException e) {
                    logger.error("Failed to delete metadata=[resource_path: {}, timestamp: {}] ", res, timestamp, e);
                }
            }
        } else {
            for (String res : toDeleteResources.keySet()) {
                long timestamp = toDeleteResources.get(res);
                logger.info("Dry run, pending delete metadata=[resource_path: {}, timestamp: {}] ", res, timestamp);
            }
        }
        return toDeleteResources;
    }

    private NavigableSet<String> noNull(NavigableSet<String> list) {
        return (list == null) ? new TreeSet<>() : list;
    }

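    // resource timestamp from the store, or Long.MAX_VALUE if it cannot be read,
    // so an unreadable resource never looks older than the cleanup threshold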
    private long getTimestamp(String resPath) {
        long timestamp = Long.MAX_VALUE;
        try {
            timestamp = store.getResourceTimestamp(resPath);
        } catch (IOException e) {
            logger.warn("Failed to get resource timestamp from remote resource store, details:{}", e);
        }
        return timestamp;
    }

    private long getTimestamp(FileStatus filestatus) {
        return filestatus.getModificationTime();
    }
}
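
For reference, a dry run of this job can be driven as in the sketch below. The driver class and the 30-day job retention value are illustrative only; the sketch relies solely on the constructor and cleanup(boolean, int) defined above.

import java.util.Map;

import org.apache.kylin.common.KylinConfig;
import org.apache.kylin.rest.job.MetadataCleanupJob;

public class MetadataCleanupDryRun { // hypothetical driver, not part of Kylin

    public static void main(String[] args) throws Exception {
        KylinConfig config = KylinConfig.getInstanceFromEnv();
        MetadataCleanupJob job = new MetadataCleanupJob(config);

        // dry run: report garbage metadata, keeping jobs finished within the last 30 days
        Map<String, Long> garbage = job.cleanup(false, 30);
        garbage.forEach((path, ts) -> System.out.println(path + " (timestamp " + ts + ")"));

        // job.cleanup(true, 30); // pass delete = true to actually remove the reported resources
    }
}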