All downloads are free. The search and download functionality uses the official Maven repository.

com.hazelcast.jet.python.PythonServiceConfig Maven / Gradle / Ivy

/*
 * Copyright 2020 Hazelcast Inc.
 *
 * Licensed under the Hazelcast Community License (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://hazelcast.com/hazelcast-community-license
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.hazelcast.jet.python;

import com.hazelcast.jet.pipeline.GeneralStage;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.util.StringJoiner;

/**
 * Configuration object for the Python service factory, used in a
 * {@link PythonTransforms#mapUsingPython mapUsingPython} stage.
 * <p>
 * Hazelcast Jet expects you to have a Python project in a local directory.
 * It must contain the definition of a {@code transform_list()} function
 * that receives a list of strings and returns a list of strings of the
 * same size, with a one-to-one mapping between input and output elements.
 * Here's a simple example of a function that transforms every input
 * string by prepending {@code "echo-"} to it:
 * <pre>{@code
 * def transform_list(input_list):
 *     return ["echo-%s" % i for i in input_list]
 * }</pre>
 * If you have a very simple setup with everything in a single Python file,
 * you can use {@link #setHandlerFile}. Let's say you saved the above
 * Python code to a file named {@code echo.py}. You can use it from Jet
 * like this:
 * <pre>{@code
 * StreamStage<String> inputStage = createInputStage();
 * StreamStage<String> outputStage = inputStage.apply(
 *         mapUsingPython(new PythonServiceConfig()
 *                 .setHandlerFile("path/to/echo.py")));
 * }</pre>
 * In more complex setups you can tell Jet the location of your project
 * {@linkplain #setBaseDir directory} and the name of the Python {@linkplain
 * #setHandlerModule module} containing {@code transform_list()}. You can
 * also use a {@linkplain #setHandlerFunction different name} for the
 * function.
 * <p>
 * Jet uploads the entire directory to the cluster, creates one or more
 * Python processes on each member, and sends the pipeline data through
 * your function. The number of processes is controlled by the {@linkplain
 * GeneralStage#setLocalParallelism local parallelism} of the Python
 * mapping stage.
 * <p>
 * Jet recognizes these special files in the base directory:
 * <ul><li>
 *     {@code requirements.txt} is assumed to list the
 *     dependencies of your Python code. Jet will automatically install
 *     them to a job-local virtual environment. You can also install the
 *     modules to the Jet servers' global Python environment in order to speed
 *     up job initialization. Jet reuses the global modules and adds the
 *     missing ones.
 * </li><li>
 *     {@code init.sh} is assumed to be a Bash script that Jet will run when
 *     initializing the job.
 * </li><li>
 *     {@code cleanup.sh} is assumed to be a Bash script that Jet will run
 *     when completing the job.
 * </li></ul>
 * Regardless of local parallelism, the init and cleanup scripts run only
 * once per cluster member. They run within the context of the job-local
 * virtual Python environment.
 * <p>
 * To use this stage in a Hazelcast Jet cluster, Python must be installed
 * on every cluster member. Jet supports Python versions 3.5-3.7. If the
 * code has dependencies on non-standard Python modules, these must either
 * be pre-installed or the member machines must have access to the public
 * internet so that Jet can download and install them. A third option is
 * to write {@code init.sh} that uses a different way of installing the
 * dependencies. In that case make sure not to use the standard filename
 * {@code requirements.txt}, which Jet uses automatically.
 * <p>
 * The Python mapping stage produces log output at the {@code FINE} level
 * under the {@code com.hazelcast.jet.python} log category. This includes
 * all the output from launched subprocesses.
 *
 * @since 4.0
 */
public class PythonServiceConfig implements Serializable {
    private static final String HANDLER_FUNCTION_DEFAULT = "transform_list";
    private static final String PYTHON_EXTENSION = ".py";

    private File baseDir;
    private File handlerFile;
    private String handlerModule;
    private String handlerFunction = HANDLER_FUNCTION_DEFAULT;

    /**
     * Validates the configuration and throws an exception if a mandatory
     * config option is missing. Called automatically from {@link
     * PythonTransforms#mapUsingPython}.
     * <p>
     * The configuration is complete when either {@code handlerFile} is set,
     * or {@code baseDir} is set together with {@code handlerModule}; in both
     * cases {@code handlerFunction} must be non-null.
     *
     * @throws InvalidPythonServiceConfigException if any mandatory option
     *         is missing; the message lists all the missing options
     */
    public void validate() {
        StringJoiner missingMandatoryFields = new StringJoiner(", ");
        if (baseDir == null) {
            if (handlerFile == null) {
                missingMandatoryFields.add("(baseDir or handlerFile)");
            }
        } else if (handlerModule == null) {
            missingMandatoryFields.add("handlerModule");
        }
        if (handlerFunction == null) {
            missingMandatoryFields.add("handlerFunction");
        }
        if (missingMandatoryFields.length() > 0) {
            throw new InvalidPythonServiceConfigException("The supplied Python Service configuration is missing these "
                    + "mandatory fields: " + missingMandatoryFields);
        }
    }

    /**
     * Returns the Python {@linkplain #setBaseDir base directory}.
     *
     * @return the canonical base directory, or {@code null} if it wasn't set
     */
    @Nullable
    public File baseDir() {
        return baseDir;
    }

    /**
     * Sets the base directory where the Python files reside. When you set this,
     * also set the name of the {@link #setHandlerModule handler module} to
     * identify the location of the handler function (named {@code
     * transform_list()} by convention).
     * <p>
     * If all you need to deploy to Jet is in a single file, you can call {@link
     * #setHandlerFile} instead.
     *
     * @param baseDir path of the directory; leading and trailing whitespace is
     *                trimmed and the path is converted to its canonical form
     * @return {@code this}, for fluent chaining
     * @throws IllegalArgumentException if {@code handlerFile} was already set,
     *         or if {@code baseDir} is blank
     * @throws InvalidPythonServiceConfigException if the path can't be
     *         canonicalized or doesn't denote a directory
     */
    @Nonnull
    public PythonServiceConfig setBaseDir(@Nonnull String baseDir) {
        if (handlerFile != null) {
            throw new IllegalArgumentException(
                    "You already set handlerFile so you can't set baseDir."
                    + " When using baseDir, set handlerModule instead.");
        }
        String baseDirStr = requireNonBlank(baseDir, "baseDir");
        try {
            File dir = new File(baseDirStr).getCanonicalFile();
            if (!dir.isDirectory()) {
                throw new IOException("Not a directory: " + dir);
            }
            this.baseDir = dir;
        } catch (IOException e) {
            throw new InvalidPythonServiceConfigException("Invalid baseDir argument", e);
        }
        return this;
    }

    /**
     * Returns the Python {@linkplain #setHandlerFile handler file}.
     *
     * @return the canonical handler file, or {@code null} if it wasn't set
     */
    @Nullable
    public File handlerFile() {
        return handlerFile;
    }

    /**
     * Sets the Python handler file. It must contain the {@linkplain
     * #setHandlerFunction handler function}. If your Python work is in more
     * than one file, call {@link #setBaseDir} instead.
     * <p>
     * As a side effect this also sets the {@linkplain #handlerModule() handler
     * module} to the name of the canonical file with the {@code .py} extension
     * removed. Because of that, calling this method a second time on the same
     * instance fails with {@code IllegalStateException}.
     *
     * @param handlerFile path of the file; leading and trailing whitespace is
     *                    trimmed. The name must end with {@code .py}, compared
     *                    case-insensitively.
     * @return {@code this}, for fluent chaining
     * @throws IllegalStateException if {@code baseDir} or {@code handlerModule}
     *         was already set
     * @throws IllegalArgumentException if {@code handlerFile} is blank or
     *         doesn't have the {@code .py} extension
     * @throws InvalidPythonServiceConfigException if the path can't be
     *         canonicalized or doesn't denote a regular, readable file
     */
    @Nonnull
    public PythonServiceConfig setHandlerFile(@Nonnull String handlerFile) {
        if (baseDir != null) {
            throw new IllegalStateException(
                    "You already set baseDir so you can't set handlerFile."
                    + " If you want to set the handler module, call setHandlerModule().");
        }
        if (handlerModule != null) {
            throw new IllegalStateException(
                    "You already set handlerModule, it would be overwritten by setting handlerFile");
        }
        String handlerFileStr = requireNonBlank(handlerFile, "handlerFile");
        if (!hasPythonExtension(handlerFileStr)) {
            throw new IllegalArgumentException("The handler file must be a .py file");
        }
        try {
            File file = new File(handlerFileStr).getCanonicalFile();
            if (!file.isFile() || !file.canRead()) {
                throw new IOException("Not a regular, readable file: " + file);
            }
            this.handlerFile = file;
            // The extension was accepted case-insensitively above, so it must be
            // stripped the same way; otherwise "Echo.PY" would give the module
            // name "Echo.PY" instead of "Echo".
            this.handlerModule = stripPythonExtension(file.getName());
        } catch (IOException e) {
            throw new InvalidPythonServiceConfigException("Invalid handlerFile argument", e);
        }
        return this;
    }

    /**
     * Returns the {@linkplain #setHandlerModule handler module} name.
     *
     * @return the handler module name, or {@code null} if neither the handler
     *         module nor the handler file was set
     */
    @Nullable
    public String handlerModule() {
        return handlerModule;
    }

    /**
     * Sets the name of the Python module that has the function that
     * transforms Jet pipeline data.
     *
     * @param handlerModule name of the module; leading and trailing whitespace
     *                      is trimmed
     * @return {@code this}, for fluent chaining
     * @throws IllegalStateException if {@code handlerFile} was already set
     * @throws IllegalArgumentException if {@code handlerModule} is blank
     */
    @Nonnull
    public PythonServiceConfig setHandlerModule(@Nonnull String handlerModule) {
        if (handlerFile != null) {
            throw new IllegalStateException(
                    "You already set handlerFile, it would be overwritten by setting handlerModule");
        }
        this.handlerModule = requireNonBlank(handlerModule, "handlerModule");
        return this;
    }

    /**
     * Returns the name of the {@linkplain #setHandlerFunction handler
     * function}. The default value is {@code transform_list}.
     *
     * @return the handler function name
     */
    @Nonnull
    public String handlerFunction() {
        return handlerFunction;
    }

    /**
     * Overrides the default name of the Python function that transforms Jet
     * pipeline data. The default name is {@value #HANDLER_FUNCTION_DEFAULT}.
     * It must be defined in the module you configured with {@link
     * #setHandlerModule}, must take a single argument that is a list of
     * strings, and return another list of strings which has the results of
     * transforming each item in the input list. There must be a strict
     * one-to-one match between the input and output lists.
     *
     * @param handlerFunction name of the function; leading and trailing
     *                        whitespace is trimmed
     * @return {@code this}, for fluent chaining
     * @throws IllegalArgumentException if {@code handlerFunction} is blank
     */
    @Nonnull
    public PythonServiceConfig setHandlerFunction(@Nonnull String handlerFunction) {
        this.handlerFunction = requireNonBlank(handlerFunction, "handlerFunction");
        return this;
    }

    /**
     * Trims the supplied string and ensures the result isn't empty.
     *
     * @param in the value to check
     * @param name the parameter name to report in the exception message
     * @return the trimmed value
     * @throws IllegalArgumentException if the trimmed value is empty
     */
    private static String requireNonBlank(@Nonnull String in, @Nonnull String name) {
        in = in.trim();
        if (in.isEmpty()) {
            throw new IllegalArgumentException("Parameter must not be blank: " + name);
        }
        return in;
    }

    /**
     * Tells whether the supplied name ends with {@code .py}, ignoring case.
     * Uses {@link String#regionMatches(boolean, int, String, int, int)} so
     * the result doesn't depend on the default locale.
     */
    private static boolean hasPythonExtension(@Nonnull String fileName) {
        int extLength = PYTHON_EXTENSION.length();
        // A name shorter than the extension gives a negative offset, for which
        // regionMatches() returns false rather than throwing.
        return fileName.regionMatches(true, fileName.length() - extLength, PYTHON_EXTENSION, 0, extLength);
    }

    /**
     * Returns the supplied name without its trailing {@code .py} extension
     * (matched ignoring case). A name without that extension is returned
     * unchanged.
     */
    private static String stripPythonExtension(@Nonnull String fileName) {
        return hasPythonExtension(fileName)
                ? fileName.substring(0, fileName.length() - PYTHON_EXTENSION.length())
                : fileName;
    }
}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy