All downloads are free. Search and download functionality uses the official Maven repository.

tech.ytsaurus.client.operations.MapperOrReducerSpec Maven / Gradle / Ivy

The newest version!
package tech.ytsaurus.client.operations;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;

import javax.annotation.Nullable;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import tech.ytsaurus.client.FileWriter;
import tech.ytsaurus.client.TransactionalClient;
import tech.ytsaurus.client.request.CreateNode;
import tech.ytsaurus.client.request.WriteFile;
import tech.ytsaurus.core.DataSize;
import tech.ytsaurus.core.GUID;
import tech.ytsaurus.core.JavaOptions;
import tech.ytsaurus.core.cypress.CypressNodeType;
import tech.ytsaurus.core.cypress.YPath;
import tech.ytsaurus.lang.NonNullApi;
import tech.ytsaurus.lang.NonNullFields;
import tech.ytsaurus.ysontree.YTreeBuilder;
import tech.ytsaurus.ysontree.YTreeNode;


/**
 * Immutable base class of {@link MapperSpec}, {@link ReducerSpec} and {@link VanillaJob}.
 */
@NonNullApi
@NonNullFields
public abstract class MapperOrReducerSpec implements UserJobSpec {
    public static final DataSize DEFAULT_MEMORY_LIMIT = DataSize.fromMegaBytes(512);
    public static final JavaOptions DEFAULT_JAVA_OPTIONS = JavaOptions.empty().withXmx(DEFAULT_MEMORY_LIMIT);

    private static final Logger logger = LoggerFactory.getLogger(MapperOrReducerSpec.class);

    protected final Class mainClazz;
    protected final MapperOrReducer mapperOrReducer;
    protected final Set additionalFiles;
    protected final JavaOptions javaOptions;
    protected final DataSize memoryLimit;
    protected final boolean useTmpfs;
    protected final @Nullable
    DataSize tmpfsSize;
    protected final @Nullable
    Double cpuLimit;
    protected final @Nullable
    Long jobTimeLimit;
    protected final @Nullable
    Integer jobCount;
    protected final Map environment;
    protected final List layerPaths;
    protected final @Nullable
    Integer customStatisticsCountLimit;
    protected final @Nullable
    Double memoryReserveFactor;
    protected final @Nullable
    String networkProject;
    protected final @Nullable
    Duration prepareTimeLimit;

    protected MapperOrReducerSpec(Class mainClazz, Builder builder) {
        if (builder.userJob == null) {
            throw new RuntimeException("userJob wasn't set");
        }

        this.mainClazz = mainClazz;
        mapperOrReducer = builder.userJob;
        additionalFiles = builder.additionalFiles;
        javaOptions = builder.javaOptions;
        memoryLimit = builder.memoryLimit;
        useTmpfs = builder.useTmpfs;
        tmpfsSize = builder.tmpfsSize;
        cpuLimit = builder.cpuLimit;
        jobTimeLimit = builder.jobTimeLimit;
        jobCount = builder.jobCount;
        environment = builder.environment;
        layerPaths = builder.layerPaths;
        customStatisticsCountLimit = builder.customStatisticsCountLimit;
        memoryReserveFactor = builder.memoryReserveFactor;
        networkProject = builder.networkProject;
        prepareTimeLimit = builder.prepareTimeLimit;
    }

    /**
     * @return name of mapper or reducer class.
     */
    public String getMapperOrReducerTitle() {
        return mapperOrReducer.getClass().getName();
    }

    /**
     * @return if actual row and table indexes will be available in OperationContext.
     */
    private boolean trackIndices() {
        return mapperOrReducer.trackIndices();
    }

    JobIo createJobIo(@Nullable JobIo jobIo) {
        jobIo = jobIo == null ? new JobIo() : jobIo;
        if (!trackIndices()) {
            return jobIo;
        }
        return jobIo.toBuilder()
                .setEnableRowIndex(true)
                .setEnableTableIndex(true)
                .build();
    }

    protected static class Resource {
        private final YPath path;
        private final List args;

        public Resource(YPath path, List args) {
            this.path = path;
            this.args = args;
        }
    }

    protected Optional detectResourcesUnsafe(
            TransactionalClient yt,
            MapperOrReducer mapperOrReducer,
            SpecPreparationContext context
    ) throws IOException {
        List args = new ArrayList<>();

        if (mapperOrReducer instanceof Serializable) {
            String fileName = GUID.create() + ".serializable";
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            ObjectOutputStream oos = new ObjectOutputStream(baos);
            oos.writeObject(mapperOrReducer);
            oos.close();

            byte[] bytes = baos.toByteArray();
            args.add("serializable");

            YPath path = context.getConfiguration().getTmpDir().child(fileName);

            yt.createNode(
                    CreateNode.builder().setType(CypressNodeType.FILE).setPath(path).build()
            ).join();
            FileWriter writer = yt.writeFile(new WriteFile(path.toString())).join();
            writer.write(bytes);
            writer.readyEvent().join();
            writer.close().join();

            args.add(fileName);

            return Optional.of(new Resource(
                    path.plusAdditionalAttribute("file_name", fileName),
                    args));
        }

        return Optional.empty();
    }

    private Optional detectResources(
            TransactionalClient yt, MapperOrReducer mapperOrReducer, SpecPreparationContext context) {
        try {
            return detectResourcesUnsafe(yt, mapperOrReducer, context);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    private String canonizeJavaPath(String javaPath) {
        String[] pathParts = javaPath.split(":");
        List canonicalPathParts = new ArrayList<>(pathParts.length);
        for (String path : pathParts) {
            try {
                canonicalPathParts.add(new File(path).getCanonicalPath());
            } catch (IOException ex) {
                throw new RuntimeException(ex);
            }
        }

        return String.join(":", canonicalPathParts);
    }

    /**
     * Upload necessary jars and files to YT if it is needed,
     * construct java command and create spec as yson.
     */
    @Override
    public YTreeBuilder prepare(
            YTreeBuilder builder, TransactionalClient yt, SpecPreparationContext specPreparationContext,
            FormatContext formatContext) {
        Set files = new HashSet<>(additionalFiles);

        boolean isLocalMode = specPreparationContext.getConfiguration().isLocalMode();
        String classPath;
        String libraryPath = null;

        if (isLocalMode) {
            classPath = canonizeJavaPath(System.getProperty("java.class.path"));
            libraryPath = canonizeJavaPath(System.getProperty("java.library.path"));
        } else {
            Set jars = specPreparationContext.getConfiguration().getJarsProcessor().uploadJars(
                    yt.getRootClient(), mapperOrReducer, isLocalMode);
            files.addAll(jars);
            List jarFileNames = jars.stream()
                    .map(x -> x.getAdditionalAttribute("file_name")
                            .map(YTreeNode::stringValue)
                            .orElseGet(x::name))
                    .collect(Collectors.toList());

            classPath = String.join(":", jarFileNames);
        }

        Set autoDetectedResources = specPreparationContext.getConfiguration().getJarsProcessor().uploadResources(
                yt.getRootClient(), mapperOrReducer);
        files.addAll(autoDetectedResources);

        Optional resource = detectResources(yt, mapperOrReducer, specPreparationContext);

        List args = new ArrayList<>();
        args.add(String.valueOf(formatContext.getOutputTableCount()
                .orElseThrow(IllegalArgumentException::new)));

        if (resource.isEmpty()) {
            args.add("simple");
            args.add(JavaYtRunner.normalizeClassName(mapperOrReducer.getClass().getName()));
        } else {
            args.addAll(resource.get().args);
            files.add(resource.get().path);
        }

        String javaBinary = specPreparationContext.getConfiguration().getJavaBinary();
        JavaOptions resultJavaOptions = JavaOptions.empty();

        for (String option : specPreparationContext.getConfiguration().getJavaOptions()) {
            resultJavaOptions = resultJavaOptions.withOption(option);
        }

        for (String option : this.javaOptions.getOptions()) {
            resultJavaOptions = resultJavaOptions.withOption(option);
        }

        return builder.beginMap()
                .key("command").value(
                        JavaYtRunner.command(javaBinary, classPath, libraryPath, resultJavaOptions,
                                mainClazz.getName(), args)
                )
                .key("input_format").value(mapperOrReducer.inputType().format(formatContext))
                .key("output_format").value(mapperOrReducer.outputType().format(formatContext))
                .key("file_paths").value(files, (b, t) ->
                        b.apply(t::toTree)
                )
                .key("memory_limit").value(memoryLimit.toBytes())
                .when(memoryReserveFactor != null, b -> b.key("memory_reserve_factor").value(memoryReserveFactor))
                .when(useTmpfs, b -> b.key("tmpfs_path").value(".").key("copy_files").value(true))
                .when(tmpfsSize != null, b -> b.key("tmpfs_size").value(Objects.requireNonNull(tmpfsSize).toBytes()))
                .when(cpuLimit != null, b -> b.key("cpu_limit").value(cpuLimit))
                .when(jobTimeLimit != null, b -> b.key("job_time_limit").value(jobTimeLimit))
                .when(jobCount != null, b -> b.key("job_count").value(jobCount))
                .key("environment").value(environment)
                .key("layer_paths").value(layerPaths.stream().map(YPath::toTree).collect(Collectors.toList()))
                .when(customStatisticsCountLimit != null, b -> b.key("custom_statistics_count_limit")
                        .value(customStatisticsCountLimit))
                .when(networkProject != null, b -> b.key("network_project").value(networkProject))
                .when(prepareTimeLimit != null, b -> b.key("prepare_time_limit")
                        .value(Objects.requireNonNull(prepareTimeLimit).toMillis()))
                .when(formatContext.getOutputStreams().isPresent(),
                        b -> b.key("output_streams").value(formatContext.getOutputStreams().get()))
                .endMap();
    }

    /**
     * Builder for {@link MapperOrReducerSpec}.
     */
    @NonNullApi
    @NonNullFields
    public abstract static class Builder> {
        @Nullable
        MapperOrReducer userJob = null;
        Set additionalFiles = Collections.emptySet();
        JavaOptions javaOptions = DEFAULT_JAVA_OPTIONS;
        DataSize memoryLimit = DEFAULT_MEMORY_LIMIT;
        boolean useTmpfs = false;
        @Nullable
        DataSize tmpfsSize = null;
        @Nullable
        Double cpuLimit = null;
        @Nullable
        Long jobTimeLimit = null;
        @Nullable
        Integer jobCount = null;
        Map environment = new HashMap<>();
        List layerPaths = new ArrayList<>();
        @Nullable
        Integer customStatisticsCountLimit = null;
        @Nullable
        Double memoryReserveFactor = null;
        @Nullable
        String networkProject = null;
        @Nullable
        Duration prepareTimeLimit = null; // defaults to 45 minutes

        public abstract MapperOrReducerSpec build();

        protected abstract T self();

        /**
         * Set user job, it is required parameter.
         */
        protected T setUserJob(MapperOrReducer userJob) {
            this.userJob = userJob;
            return self();
        }

        protected @Nullable
        MapperOrReducer getUserJob() {
            return userJob;
        }

        /**
         * Set additional files which should be available inside operation jobs.
         */
        public T setAdditionalFiles(Set additionalFiles) {
            this.additionalFiles = additionalFiles;
            return self();
        }

        /**
         * Set java options for the command which will be run.
         */
        public T setJavaOptions(JavaOptions javaOptions) {
            this.javaOptions = javaOptions;
            return self();
        }

        /**
         * Set memoryLimit which specifies how much memory job process can use.
         * By default, 512 MB.
         */
        public T setMemoryLimit(DataSize memoryLimit) {
            this.memoryLimit = memoryLimit;
            return self();
        }

        public T setUseTmpfs(boolean useTmpfs) {
            this.useTmpfs = useTmpfs;
            return self();
        }

        public T setTmpfsSize(@Nullable DataSize tmpfsSize) {
            this.tmpfsSize = tmpfsSize;
            return self();
        }

        /**
         * Set maximum number of CPU cores for a single job to use.
         */
        public T setCpuLimit(@Nullable Double cpuLimit) {
            this.cpuLimit = cpuLimit;
            return self();
        }

        /**
         * Set limit on job execution time.
         * Jobs that exceed this limit will be considered failed.
         */
        public T setJobTimeLimit(@Nullable Long jobTimeLimit) {
            this.jobTimeLimit = jobTimeLimit;
            return self();
        }

        /**
         * Set how many jobs should be run, it is advisory.
         */
        public T setJobCount(@Nullable Integer jobCount) {
            this.jobCount = jobCount;
            return self();
        }

        /**
         * Set a dictionary of environment variables that will be specified during the operation.
         */
        public T setEnvironment(Map environment) {
            this.environment = environment;
            return self();
        }

        /**
         * Set list of paths to porto layers in Cypress.
         * Layers are listed from top to bottom.
         */
        public T setLayerPaths(List layerPaths) {
            this.layerPaths = layerPaths;
            return self();
        }

        /**
         * Set limit on the number of user statistics that can be written from a job.
         */
        public T setCustomStatisticsCountLimit(@Nullable Integer customStatisticsCountLimit) {
            this.customStatisticsCountLimit = customStatisticsCountLimit;
            return self();
        }

        /**
         * Set memory reserve factor (fraction of memoryLimit that job gets at start).
         * 
         * documentation
         * 
         *
         * @param memoryReserveFactor memory reserve factor, if set to null default value (0.5) will be used.
         */
        public T setMemoryReserveFactor(@Nullable Double memoryReserveFactor) {
            this.memoryReserveFactor = memoryReserveFactor;
            return self();
        }

        public T setNetworkProject(@Nullable String networkProject) {
            this.networkProject = networkProject;
            return self();
        }

        /**
         * Set time limit for the job preparation stage.
         */
        public T setPrepareTimeLimit(@Nullable Duration prepareTimeLimit) {
            this.prepareTimeLimit = prepareTimeLimit;
            return self();
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy