All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.flink.python.util.PythonDependencyUtils Maven / Gradle / Ivy

There is a newer version: 1.14.6
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.python.util;

import org.apache.flink.annotation.Internal;
import org.apache.flink.api.common.cache.DistributedCache;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.ConfigOption;
import org.apache.flink.configuration.ConfigOptions;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.configuration.ReadableConfig;
import org.apache.flink.python.PythonOptions;
import org.apache.flink.util.Preconditions;
import org.apache.flink.util.StringUtils;

import org.apache.commons.cli.CommandLine;

import javax.annotation.Nullable;

import java.io.File;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import static org.apache.flink.client.cli.CliFrontendParser.PYARCHIVE_OPTION;
import static org.apache.flink.client.cli.CliFrontendParser.PYEXEC_OPTION;
import static org.apache.flink.client.cli.CliFrontendParser.PYFILES_OPTION;
import static org.apache.flink.client.cli.CliFrontendParser.PYREQUIREMENTS_OPTION;
import static org.apache.flink.python.PythonOptions.PYTHON_CLIENT_EXECUTABLE;
import static org.apache.flink.python.PythonOptions.PYTHON_EXECUTABLE;

/**
 * Utility class for Python dependency management. The dependencies will be registered at the
 * distributed cache.
 */
@Internal
public class PythonDependencyUtils {

    public static final String FILE = "file";
    public static final String CACHE = "cache";
    public static final String FILE_DELIMITER = ",";
    public static final String PARAM_DELIMITER = "#";
    private static final String HASH_ALGORITHM = "SHA-256";

    // Internal Python Config Options.

    public static final ConfigOption> PYTHON_FILES =
            ConfigOptions.key("python.internal.files-key-map").mapType().noDefaultValue();
    public static final ConfigOption> PYTHON_REQUIREMENTS_FILE =
            ConfigOptions.key("python.internal.requirements-file-key").mapType().noDefaultValue();
    public static final ConfigOption> PYTHON_ARCHIVES =
            ConfigOptions.key("python.internal.archives-key-map").mapType().noDefaultValue();

    /**
     * Adds python dependencies to registered cache file list according to given configuration and
     * returns a new configuration which contains the metadata of the registered python
     * dependencies.
     *
     * @param cachedFiles The list used to store registered cached files.
     * @param config The configuration which contains python dependency configuration.
     * @return A new configuration which contains the metadata of the registered python
     *     dependencies.
     */
    public static Configuration configurePythonDependencies(
            List> cachedFiles,
            Configuration config) {
        PythonDependencyManager pythonDependencyManager =
                new PythonDependencyManager(cachedFiles, config);
        return pythonDependencyManager.getConfigWithPythonDependencyOptions();
    }

    public static Configuration parsePythonDependencyConfiguration(CommandLine commandLine) {
        Configuration config = new Configuration();
        if (commandLine.hasOption(PYFILES_OPTION.getOpt())) {
            config.set(
                    PythonOptions.PYTHON_FILES,
                    commandLine.getOptionValue(PYFILES_OPTION.getOpt()));
        }
        if (commandLine.hasOption(PYREQUIREMENTS_OPTION.getOpt())) {
            config.set(
                    PythonOptions.PYTHON_REQUIREMENTS,
                    commandLine.getOptionValue(PYREQUIREMENTS_OPTION.getOpt()));
        }
        if (commandLine.hasOption(PYARCHIVE_OPTION.getOpt())) {
            config.set(
                    PythonOptions.PYTHON_ARCHIVES,
                    commandLine.getOptionValue(PYARCHIVE_OPTION.getOpt()));
        }
        if (commandLine.hasOption(PYEXEC_OPTION.getOpt())) {
            config.set(
                    PythonOptions.PYTHON_EXECUTABLE,
                    commandLine.getOptionValue(PYEXEC_OPTION.getOpt()));
        }
        return config;
    }

    public static void merge(Configuration config, Configuration pythonConfiguration) {
        Configuration toMerge = new Configuration(pythonConfiguration);
        if (toMerge.contains(PythonOptions.PYTHON_FILES)) {
            if (config.contains(PythonOptions.PYTHON_FILES)) {
                config.set(
                        PythonOptions.PYTHON_FILES,
                        String.join(
                                FILE_DELIMITER,
                                toMerge.get(PythonOptions.PYTHON_FILES),
                                config.get(PythonOptions.PYTHON_FILES)));
            } else {
                config.set(PythonOptions.PYTHON_FILES, toMerge.get(PythonOptions.PYTHON_FILES));
            }
            toMerge.removeConfig(PythonOptions.PYTHON_FILES);
        }
        if (toMerge.contains(PythonOptions.PYTHON_ARCHIVES)) {
            if (config.contains(PythonOptions.PYTHON_ARCHIVES)) {
                config.set(
                        PythonOptions.PYTHON_ARCHIVES,
                        String.join(
                                FILE_DELIMITER,
                                toMerge.get(PythonOptions.PYTHON_ARCHIVES),
                                config.get(PythonOptions.PYTHON_ARCHIVES)));
            } else {
                config.set(
                        PythonOptions.PYTHON_ARCHIVES, toMerge.get(PythonOptions.PYTHON_ARCHIVES));
            }
            toMerge.removeConfig(PythonOptions.PYTHON_ARCHIVES);
        }
        config.addAll(toMerge);
    }

    /** Helper class for Python dependency management. */
    private static class PythonDependencyManager {

        private static final String PYTHON_FILE_PREFIX = "python_file";
        private static final String PYTHON_REQUIREMENTS_FILE_PREFIX = "python_requirements_file";
        private static final String PYTHON_REQUIREMENTS_CACHE_PREFIX = "python_requirements_cache";
        private static final String PYTHON_ARCHIVE_PREFIX = "python_archive";

        private final List> cachedFiles;
        private final Configuration internalConfig;

        private PythonDependencyManager(
                List> cachedFiles,
                Configuration config) {
            this.cachedFiles = cachedFiles;
            this.internalConfig = new Configuration(config);
            configure(config);
        }

        /**
         * Adds a Python dependency which could be .py files, Python packages(.zip, .egg etc.) or
         * local directories. The dependencies will be added to the PYTHONPATH of the Python UDF
         * worker and the local Py4J python client.
         *
         * @param filePath The path of the Python dependency.
         */
        private void addPythonFile(String filePath) {
            Preconditions.checkNotNull(filePath);
            String fileKey = generateUniqueFileKey(PYTHON_FILE_PREFIX, filePath);
            registerCachedFileIfNotExist(filePath, fileKey);
            if (!internalConfig.contains(PYTHON_FILES)) {
                internalConfig.set(PYTHON_FILES, new LinkedHashMap<>());
            }
            internalConfig.get(PYTHON_FILES).put(fileKey, new File(filePath).getName());
        }

        /**
         * Specifies the third-party dependencies via a requirements file. These dependencies will
         * be installed by the command "pip install -r [requirements file]" before launching the
         * Python UDF worker.
         *
         * @param requirementsFilePath The path of the requirements file.
         */
        private void setPythonRequirements(String requirementsFilePath) {
            setPythonRequirements(requirementsFilePath, null);
        }

        /**
         * Specifies the third-party dependencies via a requirements file. The
         * `requirementsCachedDir` will be uploaded to support offline installation. These
         * dependencies will be installed by the command "pip install -r [requirements file]
         * --find-links [requirements cached dir]" before launching the Python UDF worker.
         *
         * @param requirementsFilePath The path of the requirements file.
         * @param requirementsCachedDir The path of the requirements cached directory.
         */
        private void setPythonRequirements(
                String requirementsFilePath, @Nullable String requirementsCachedDir) {
            Preconditions.checkNotNull(requirementsFilePath);
            if (!internalConfig.contains(PYTHON_REQUIREMENTS_FILE)) {
                internalConfig.set(PYTHON_REQUIREMENTS_FILE, new HashMap<>());
            }
            internalConfig.get(PYTHON_REQUIREMENTS_FILE).clear();
            removeCachedFilesByPrefix(PYTHON_REQUIREMENTS_FILE_PREFIX);
            removeCachedFilesByPrefix(PYTHON_REQUIREMENTS_CACHE_PREFIX);

            String fileKey =
                    generateUniqueFileKey(PYTHON_REQUIREMENTS_FILE_PREFIX, requirementsFilePath);
            registerCachedFileIfNotExist(requirementsFilePath, fileKey);
            internalConfig.get(PYTHON_REQUIREMENTS_FILE).put(FILE, fileKey);

            if (requirementsCachedDir != null) {
                String cacheDirKey =
                        generateUniqueFileKey(
                                PYTHON_REQUIREMENTS_CACHE_PREFIX, requirementsCachedDir);
                registerCachedFileIfNotExist(requirementsCachedDir, cacheDirKey);
                internalConfig.get(PYTHON_REQUIREMENTS_FILE).put(CACHE, cacheDirKey);
            }
        }

        /**
         * Adds a Python archive file (zip format). The file will be extracted and moved to a
         * dedicated directory under the working directory of the Python UDF workers. The param
         * `targetDir` is the name of the dedicated directory. The Python UDFs and the config option
         * "python.executable" could access the extracted files via relative path.
         *
         * @param archivePath The path of the archive file.
         * @param targetDir The name of the target directory.
         */
        private void addPythonArchive(String archivePath, @Nullable String targetDir) {
            Preconditions.checkNotNull(archivePath);
            if (!internalConfig.contains(PYTHON_ARCHIVES)) {
                internalConfig.set(PYTHON_ARCHIVES, new HashMap<>());
            }
            if (targetDir == null) {
                targetDir = new File(archivePath).getName();
            }
            String fileKey =
                    generateUniqueFileKey(
                            PYTHON_ARCHIVE_PREFIX, archivePath + PARAM_DELIMITER + targetDir);
            registerCachedFileIfNotExist(archivePath, fileKey);
            internalConfig.get(PYTHON_ARCHIVES).put(fileKey, targetDir);
        }

        private void configure(ReadableConfig config) {
            config.getOptional(PythonOptions.PYTHON_FILES)
                    .ifPresent(
                            pyFiles -> {
                                for (String filePath : pyFiles.split(FILE_DELIMITER)) {
                                    addPythonFile(filePath);
                                }
                            });

            config.getOptional(PythonOptions.PYTHON_REQUIREMENTS)
                    .ifPresent(
                            pyRequirements -> {
                                if (pyRequirements.contains(PARAM_DELIMITER)) {
                                    String[] requirementFileAndCache =
                                            pyRequirements.split(PARAM_DELIMITER, 2);
                                    setPythonRequirements(
                                            requirementFileAndCache[0], requirementFileAndCache[1]);
                                } else {
                                    setPythonRequirements(pyRequirements);
                                }
                            });

            config.getOptional(PythonOptions.PYTHON_ARCHIVES)
                    .ifPresent(
                            pyArchives -> {
                                for (String archive : pyArchives.split(FILE_DELIMITER)) {
                                    String archivePath;
                                    String targetDir;
                                    if (archive.contains(PARAM_DELIMITER)) {
                                        String[] filePathAndTargetDir =
                                                archive.split(PARAM_DELIMITER, 2);
                                        archivePath = filePathAndTargetDir[0];
                                        targetDir = filePathAndTargetDir[1];
                                    } else {
                                        archivePath = archive;
                                        targetDir = null;
                                    }
                                    addPythonArchive(archivePath, targetDir);
                                }
                            });

            config.getOptional(PYTHON_EXECUTABLE)
                    .ifPresent(e -> internalConfig.set(PYTHON_EXECUTABLE, e));

            config.getOptional(PYTHON_CLIENT_EXECUTABLE)
                    .ifPresent(e -> internalConfig.set(PYTHON_CLIENT_EXECUTABLE, e));
        }

        private String generateUniqueFileKey(String prefix, String hashString) {
            MessageDigest messageDigest;
            try {
                messageDigest = MessageDigest.getInstance(HASH_ALGORITHM);
            } catch (NoSuchAlgorithmException e) {
                throw new RuntimeException(e);
            }
            messageDigest.update(hashString.getBytes(StandardCharsets.UTF_8));

            return String.format(
                    "%s_%s", prefix, StringUtils.byteToHexString(messageDigest.digest()));
        }

        private void registerCachedFileIfNotExist(String filePath, String fileKey) {
            if (cachedFiles.stream().noneMatch(t -> t.f0.equals(fileKey))) {
                cachedFiles.add(
                        new Tuple2<>(
                                fileKey,
                                new DistributedCache.DistributedCacheEntry(filePath, false)));
            }
        }

        private void removeCachedFilesByPrefix(String prefix) {
            cachedFiles.removeAll(
                    cachedFiles.stream()
                            .filter(t -> t.f0.matches("^" + prefix + "_[a-z0-9]{64}$"))
                            .collect(Collectors.toSet()));
        }

        private Configuration getConfigWithPythonDependencyOptions() {
            return internalConfig;
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy