// org.apache.ignite.ml.dataset.Dataset (artifact listing header from Maven / Gradle / Ivy page)
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.ignite.ml.dataset;
import java.io.Serializable;
import org.apache.ignite.ml.dataset.impl.cache.CacheBasedDataset;
import org.apache.ignite.ml.dataset.impl.local.LocalDataset;
import org.apache.ignite.ml.math.functions.IgniteBiConsumer;
import org.apache.ignite.ml.math.functions.IgniteBiFunction;
import org.apache.ignite.ml.math.functions.IgniteBinaryOperator;
import org.apache.ignite.ml.math.functions.IgniteConsumer;
import org.apache.ignite.ml.math.functions.IgniteFunction;
import org.apache.ignite.ml.math.functions.IgniteTriConsumer;
import org.apache.ignite.ml.math.functions.IgniteTriFunction;
/**
* A dataset providing an API that allows to perform generic computations on a distributed data represented as a set of
* partitions distributed across a cluster or placed locally. Every partition contains a {@code context} (reliably
* stored segment) and {@code data} (unreliably stored segment, which can be recovered from an upstream data and a
* {@code context} if needed). Computations are performed in a {@code MapReduce} manner, what allows to reduce a
* network traffic for most of the machine learning algorithms.
*
* Dataset functionality allows to implement iterative machine learning algorithms via introducing computation
* context. In case iterative algorithm requires to maintain a state available and updatable on every iteration this
* state can be stored in the {@code context} of the partition and after that it will be available in further
* computations even if the Ignite Cache partition will be moved to another node because of node failure or rebalancing.
*
*
 * Partition {@code context} should be {@link Serializable} to be saved in Ignite Cache. Partition {@code data}
* should be {@link AutoCloseable} to allow system to clean up correspondent resources when partition {@code data} is
* not needed anymore.
*
 * @param <C> Type of a partition {@code context}.
 * @param <D> Type of a partition {@code data}.
*
* @see CacheBasedDataset
* @see LocalDataset
* @see DatasetFactory
*/
public interface Dataset extends AutoCloseable {
/**
* Applies the specified {@code map} function to every partition {@code data}, {@code context} and partition
* index in the dataset and then reduces {@code map} results to final result by using the {@code reduce} function.
*
* @param map Function applied to every partition {@code data}, {@code context} and partition index.
* @param reduce Function applied to results of {@code map} to get final result.
* @param identity Identity.
* @param Type of a result.
* @return Final result.
*/
public R computeWithCtx(IgniteTriFunction map, IgniteBinaryOperator reduce, R identity);
/**
* Applies the specified {@code map} function to every partition {@code data} and partition index in the dataset
* and then reduces {@code map} results to final result by using the {@code reduce} function.
*
* @param map Function applied to every partition {@code data} and partition index.
* @param reduce Function applied to results of {@code map} to get final result.
* @param identity Identity.
* @param Type of a result.
* @return Final result.
*/
public R compute(IgniteBiFunction map, IgniteBinaryOperator reduce, R identity);
/**
* Applies the specified {@code map} function to every partition {@code data}, {@code context} and partition
* index in the dataset and then reduces {@code map} results to final result by using the {@code reduce} function.
*
* @param map Function applied to every partition {@code data}, {@code context} and partition index.
* @param reduce Function applied to results of {@code map} to get final result.
* @param Type of a result.
* @return Final result.
*/
default public R computeWithCtx(IgniteTriFunction map, IgniteBinaryOperator reduce) {
return computeWithCtx(map, reduce, null);
}
/**
* Applies the specified {@code map} function to every partition {@code data} and partition index in the dataset
* and then reduces {@code map} results to final result by using the {@code reduce} function.
*
* @param map Function applied to every partition {@code data} and partition index.
* @param reduce Function applied to results of {@code map} to get final result.
* @param Type of a result.
* @return Final result.
*/
default public R compute(IgniteBiFunction map, IgniteBinaryOperator reduce) {
return compute(map, reduce, null);
}
/**
* Applies the specified {@code map} function to every partition {@code data} and {@code context} in the dataset
* and then reduces {@code map} results to final result by using the {@code reduce} function.
*
* @param map Function applied to every partition {@code data} and {@code context}.
* @param reduce Function applied to results of {@code map} to get final result.
* @param identity Identity.
* @param Type of a result.
* @return Final result.
*/
default public R computeWithCtx(IgniteBiFunction map, IgniteBinaryOperator reduce, R identity) {
return computeWithCtx((ctx, data, partIdx) -> map.apply(ctx, data), reduce, identity);
}
/**
* Applies the specified {@code map} function to every partition {@code data} in the dataset and then reduces
* {@code map} results to final result by using the {@code reduce} function.
*
* @param map Function applied to every partition {@code data}.
* @param reduce Function applied to results of {@code map} to get final result.
* @param identity Identity.
* @param Type of a result.
* @return Final result.
*/
default public R compute(IgniteFunction map, IgniteBinaryOperator reduce, R identity) {
return compute((data, partIdx) -> map.apply(data), reduce, identity);
}
/**
* Applies the specified {@code map} function to every partition {@code data} and {@code context} in the dataset
* and then reduces {@code map} results to final result by using the {@code reduce} function.
*
* @param map Function applied to every partition {@code data} and {@code context}.
* @param reduce Function applied to results of {@code map} to get final result.
* @param Type of a result.
* @return Final result.
*/
default public R computeWithCtx(IgniteBiFunction map, IgniteBinaryOperator reduce) {
return computeWithCtx((ctx, data, partIdx) -> map.apply(ctx, data), reduce);
}
/**
* Applies the specified {@code map} function to every partition {@code data} in the dataset and then reduces
* {@code map} results to final result by using the {@code reduce} function.
*
* @param map Function applied to every partition {@code data}.
* @param reduce Function applied to results of {@code map} to get final result.
* @param Type of a result.
* @return Final result.
*/
default public R compute(IgniteFunction map, IgniteBinaryOperator reduce) {
return compute((data, partIdx) -> map.apply(data), reduce);
}
/**
* Applies the specified {@code map} function to every partition {@code data}, {@code context} and partition
* index in the dataset.
*
* @param map Function applied to every partition {@code data}, {@code context} and partition index.
*/
default public void computeWithCtx(IgniteTriConsumer map) {
computeWithCtx((ctx, data, partIdx) -> {
map.accept(ctx, data, partIdx);
return null;
}, (a, b) -> null);
}
/**
* Applies the specified {@code map} function to every partition {@code data} in the dataset and partition index.
*
* @param map Function applied to every partition {@code data} and partition index.
*/
default public void compute(IgniteBiConsumer map) {
compute((data, partIdx) -> {
map.accept(data, partIdx);
return null;
}, (a, b) -> null);
}
/**
* Applies the specified {@code map} function to every partition {@code data} and {@code context} in the dataset.
*
* @param map Function applied to every partition {@code data} and {@code context}.
*/
default public void computeWithCtx(IgniteBiConsumer map) {
computeWithCtx((ctx, data, partIdx) -> map.accept(ctx, data));
}
/**
* Applies the specified {@code map} function to every partition {@code data} in the dataset.
*
* @param map Function applied to every partition {@code data}.
*/
default public void compute(IgniteConsumer map) {
compute((data, partIdx) -> map.accept(data));
}
/**
* Wraps this dataset into the specified wrapper to introduce new functionality based on {@code compute} and
* {@code computeWithCtx} methods.
*
* @param wrapper Dataset wrapper.
* @param Type of a new wrapped dataset.
* @return New wrapped dataset.
*/
default public > I wrap(IgniteFunction, I> wrapper) {
return wrapper.apply(this);
}
}