/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.ignite.ml.dataset;

import java.io.Serializable;
import org.apache.ignite.ml.dataset.impl.cache.CacheBasedDataset;
import org.apache.ignite.ml.dataset.impl.local.LocalDataset;
import org.apache.ignite.ml.math.functions.IgniteBiConsumer;
import org.apache.ignite.ml.math.functions.IgniteBiFunction;
import org.apache.ignite.ml.math.functions.IgniteBinaryOperator;
import org.apache.ignite.ml.math.functions.IgniteConsumer;
import org.apache.ignite.ml.math.functions.IgniteFunction;
import org.apache.ignite.ml.math.functions.IgniteTriConsumer;
import org.apache.ignite.ml.math.functions.IgniteTriFunction;

/**
 * A dataset providing an API for performing generic computations on distributed data represented as a set of
 * partitions spread across a cluster or placed locally. Every partition contains a {@code context} (a reliably
 * stored segment) and {@code data} (an unreliably stored segment that can be recovered from the upstream data and the
 * {@code context} if needed). Computations are performed in a {@code MapReduce} manner, which reduces network traffic
 * for most machine learning algorithms.
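 *
 * <p>For illustration only, a minimal sketch of the {@code MapReduce}-style API; the {@code dataset} variable and the
 * partition {@code data} type with a {@code rowCount()} accessor are hypothetical and not part of this interface:
 * <pre>{@code
 * // "map" runs on every partition and returns a partial result, "reduce" combines the
 * // partial results (written null-safe here), and 0L is the identity value.
 * long totalRows = dataset.compute(
 *     (data, partIdx) -> (long)data.rowCount(),
 *     (a, b) -> a == null ? b : b == null ? a : a + b,
 *     0L
 * );
 * }</pre>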
 *
 * <p>Dataset functionality allows implementing iterative machine learning algorithms by introducing a computation
 * context. If an iterative algorithm needs to maintain state that is available and updatable on every iteration, this
 * state can be stored in the partition {@code context}; it will then remain available in further computations even if
 * the Ignite Cache partition is moved to another node because of a node failure or rebalancing.
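 *
 * <p>A sketch of keeping such iterative state in the {@code context}; the context type with a mutable {@code counter}
 * field, the {@code rowCount()} accessor and {@code maxIterations} are hypothetical:
 * <pre>{@code
 * // State written into the context during one computation is visible to subsequent
 * // computations, even if the underlying cache partition moves to another node.
 * for (int i = 0; i < maxIterations; i++) {
 *     dataset.computeWithCtx((ctx, data, partIdx) -> {
 *         ctx.counter += data.rowCount();
 *     });
 * }
 * }</pre>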
 *
 * <p>Partition {@code context} should be {@link Serializable} to be saved in the Ignite Cache. Partition {@code data}
 * should be {@link AutoCloseable} to allow the system to clean up the corresponding resources when the partition
 * {@code data} is not needed anymore.
 *
 * @param <C> Type of a partition {@code context}.
 * @param <D> Type of a partition {@code data}.
 *
 * @see CacheBasedDataset
 * @see LocalDataset
 * @see DatasetFactory
 */
public interface Dataset<C extends Serializable, D extends AutoCloseable> extends AutoCloseable {
    /**
     * Applies the specified {@code map} function to every partition {@code data}, {@code context} and partition
     * index in the dataset and then reduces {@code map} results to a final result by using the {@code reduce}
     * function.
     *
     * @param map Function applied to every partition {@code data}, {@code context} and partition index.
     * @param reduce Function applied to results of {@code map} to get the final result.
     * @param identity Identity.
     * @param <R> Type of a result.
     * @return Final result.
     */
    public <R> R computeWithCtx(IgniteTriFunction<C, D, Integer, R> map, IgniteBinaryOperator<R> reduce, R identity);

    /**
     * Applies the specified {@code map} function to every partition {@code data} and partition index in the dataset
     * and then reduces {@code map} results to a final result by using the {@code reduce} function.
     *
     * @param map Function applied to every partition {@code data} and partition index.
     * @param reduce Function applied to results of {@code map} to get the final result.
     * @param identity Identity.
     * @param <R> Type of a result.
     * @return Final result.
     */
    public <R> R compute(IgniteBiFunction<D, Integer, R> map, IgniteBinaryOperator<R> reduce, R identity);

    /**
     * Applies the specified {@code map} function to every partition {@code data}, {@code context} and partition
     * index in the dataset and then reduces {@code map} results to a final result by using the {@code reduce}
     * function.
     *
     * @param map Function applied to every partition {@code data}, {@code context} and partition index.
     * @param reduce Function applied to results of {@code map} to get the final result.
     * @param <R> Type of a result.
     * @return Final result.
     */
    default public <R> R computeWithCtx(IgniteTriFunction<C, D, Integer, R> map, IgniteBinaryOperator<R> reduce) {
        return computeWithCtx(map, reduce, null);
    }

    /**
     * Applies the specified {@code map} function to every partition {@code data} and partition index in the dataset
     * and then reduces {@code map} results to a final result by using the {@code reduce} function.
     *
     * @param map Function applied to every partition {@code data} and partition index.
     * @param reduce Function applied to results of {@code map} to get the final result.
     * @param <R> Type of a result.
     * @return Final result.
     */
    default public <R> R compute(IgniteBiFunction<D, Integer, R> map, IgniteBinaryOperator<R> reduce) {
        return compute(map, reduce, null);
    }

    /**
     * Applies the specified {@code map} function to every partition {@code data} and {@code context} in the dataset
     * and then reduces {@code map} results to a final result by using the {@code reduce} function.
     *
     * @param map Function applied to every partition {@code data} and {@code context}.
     * @param reduce Function applied to results of {@code map} to get the final result.
     * @param identity Identity.
     * @param <R> Type of a result.
     * @return Final result.
     */
    default public <R> R computeWithCtx(IgniteBiFunction<C, D, R> map, IgniteBinaryOperator<R> reduce, R identity) {
        return computeWithCtx((ctx, data, partIdx) -> map.apply(ctx, data), reduce, identity);
    }

    /**
     * Applies the specified {@code map} function to every partition {@code data} in the dataset and then reduces
     * {@code map} results to a final result by using the {@code reduce} function.
     *
     * @param map Function applied to every partition {@code data}.
     * @param reduce Function applied to results of {@code map} to get the final result.
     * @param identity Identity.
     * @param <R> Type of a result.
     * @return Final result.
     */
    default public <R> R compute(IgniteFunction<D, R> map, IgniteBinaryOperator<R> reduce, R identity) {
        return compute((data, partIdx) -> map.apply(data), reduce, identity);
    }

    /**
     * Applies the specified {@code map} function to every partition {@code data} and {@code context} in the dataset
     * and then reduces {@code map} results to a final result by using the {@code reduce} function.
     *
     * @param map Function applied to every partition {@code data} and {@code context}.
     * @param reduce Function applied to results of {@code map} to get the final result.
     * @param <R> Type of a result.
     * @return Final result.
     */
    default public <R> R computeWithCtx(IgniteBiFunction<C, D, R> map, IgniteBinaryOperator<R> reduce) {
        return computeWithCtx((ctx, data, partIdx) -> map.apply(ctx, data), reduce);
    }

    /**
     * Applies the specified {@code map} function to every partition {@code data} in the dataset and then reduces
     * {@code map} results to a final result by using the {@code reduce} function.
     *
     * @param map Function applied to every partition {@code data}.
     * @param reduce Function applied to results of {@code map} to get the final result.
     * @param <R> Type of a result.
     * @return Final result.
     */
    default public <R> R compute(IgniteFunction<D, R> map, IgniteBinaryOperator<R> reduce) {
        return compute((data, partIdx) -> map.apply(data), reduce);
    }

    /**
     * Applies the specified {@code map} function to every partition {@code data}, {@code context} and partition
     * index in the dataset.
     *
     * @param map Function applied to every partition {@code data}, {@code context} and partition index.
     */
    default public void computeWithCtx(IgniteTriConsumer<C, D, Integer> map) {
        computeWithCtx((ctx, data, partIdx) -> {
            map.accept(ctx, data, partIdx);
            return null;
        }, (a, b) -> null);
    }

    /**
     * Applies the specified {@code map} function to every partition {@code data} and partition index in the dataset.
     *
     * @param map Function applied to every partition {@code data} and partition index.
     */
    default public void compute(IgniteBiConsumer<D, Integer> map) {
        compute((data, partIdx) -> {
            map.accept(data, partIdx);
            return null;
        }, (a, b) -> null);
    }

    /**
     * Applies the specified {@code map} function to every partition {@code data} and {@code context} in the dataset.
     *
     * @param map Function applied to every partition {@code data} and {@code context}.
     */
    default public void computeWithCtx(IgniteBiConsumer<C, D> map) {
        computeWithCtx((ctx, data, partIdx) -> map.accept(ctx, data));
    }

    /**
     * Applies the specified {@code map} function to every partition {@code data} in the dataset.
     *
     * @param map Function applied to every partition {@code data}.
     */
    default public void compute(IgniteConsumer<D> map) {
        compute((data, partIdx) -> map.accept(data));
    }

    /**
     * Wraps this dataset into the specified wrapper to introduce new functionality based on the {@code compute} and
     * {@code computeWithCtx} methods.
     *
     * @param wrapper Dataset wrapper.
     * @param <I> Type of the new wrapped dataset.
     * @return New wrapped dataset.
     */
    default public <I extends Dataset<C, D>> I wrap(IgniteFunction<Dataset<C, D>, I> wrapper) {
        return wrapper.apply(this);
    }
}