org.apache.ignite.ml.dataset.DatasetFactory Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.ignite.ml.dataset;
import java.io.Serializable;
import java.util.Map;
import org.apache.ignite.Ignite;
import org.apache.ignite.IgniteCache;
import org.apache.ignite.ml.dataset.impl.cache.CacheBasedDatasetBuilder;
import org.apache.ignite.ml.dataset.impl.local.LocalDatasetBuilder;
import org.apache.ignite.ml.dataset.primitive.SimpleDataset;
import org.apache.ignite.ml.dataset.primitive.SimpleLabeledDataset;
import org.apache.ignite.ml.dataset.primitive.builder.context.EmptyContextBuilder;
import org.apache.ignite.ml.dataset.primitive.builder.data.SimpleDatasetDataBuilder;
import org.apache.ignite.ml.dataset.primitive.builder.data.SimpleLabeledDatasetDataBuilder;
import org.apache.ignite.ml.dataset.primitive.context.EmptyContext;
import org.apache.ignite.ml.dataset.primitive.data.SimpleDatasetData;
import org.apache.ignite.ml.dataset.primitive.data.SimpleLabeledDatasetData;
import org.apache.ignite.ml.math.functions.IgniteBiFunction;
/**
* Factory providing a client facing API that allows to construct basic and the most frequently used types of dataset.
*
*
* Dataset construction is based on three major concepts: a partition {@code upstream}, {@code context} and
* {@code data}. A partition {@code upstream} is a data source, which assumed to be available all the time regardless
* node failures and rebalancing events. A partition {@code context} is a part of a partition maintained during the
* whole computation process and stored in a reliable storage so that a {@code context} is staying available and
* consistent regardless node failures and rebalancing events as well as an {@code upstream}. A partition {@code data}
* is a part of partition maintained during a computation process in unreliable local storage such as heap, off-heap or
* GPU memory on the node where current computation is performed, so that partition {@code data} can be lost as result
* of node failure or rebalancing, but it can be restored from an {@code upstream} and a partition {@code context}.
*
*
A partition {@code context} and {@code data} are built on top of an {@code upstream} by using specified
* builders: {@link PartitionContextBuilder} and {@link PartitionDataBuilder} correspondingly. To build a generic
* dataset the following approach is used:
*
*
* {@code
* Dataset dataset = DatasetFactory.create(
* ignite,
* cache,
* partitionContextBuilder,
* partitionDataBuilder
* );
* }
*
*
*
As well as the generic building method {@code create} this factory provides methods that allow to create a
* specific dataset types such as method {@code createSimpleDataset} to create {@link SimpleDataset} and method
* {@code createSimpleLabeledDataset} to create {@link SimpleLabeledDataset}.
*
* @see Dataset
* @see PartitionContextBuilder
* @see PartitionDataBuilder
*/
public class DatasetFactory {
/**
* Creates a new instance of distributed dataset using the specified {@code partCtxBuilder} and
* {@code partDataBuilder}. This is the generic methods that allows to create any Ignite Cache based datasets with
* any desired partition {@code context} and {@code data}.
*
* @param datasetBuilder Dataset builder.
* @param partCtxBuilder Partition {@code context} builder.
* @param partDataBuilder Partition {@code data} builder.
* @param Type of a key in {@code upstream} data.
* @param ype of a value in {@code upstream} data.
* @param Type of a partition {@code context}.
* @param Type of a partition {@code data}.
* @return Dataset.
*/
public static Dataset create(
DatasetBuilder datasetBuilder, PartitionContextBuilder partCtxBuilder,
PartitionDataBuilder partDataBuilder) {
return datasetBuilder.build(
partCtxBuilder,
partDataBuilder
);
}
/**
* Creates a new instance of distributed dataset using the specified {@code partCtxBuilder} and
* {@code partDataBuilder}. This is the generic methods that allows to create any Ignite Cache based datasets with
* any desired partition {@code context} and {@code data}.
*
* @param ignite Ignite instance.
* @param upstreamCache Ignite Cache with {@code upstream} data.
* @param partCtxBuilder Partition {@code context} builder.
* @param partDataBuilder Partition {@code data} builder.
* @param Type of a key in {@code upstream} data.
* @param Type of a value in {@code upstream} data.
* @param Type of a partition {@code context}.
* @param Type of a partition {@code data}.
* @return Dataset.
*/
public static Dataset create(
Ignite ignite, IgniteCache upstreamCache, PartitionContextBuilder partCtxBuilder,
PartitionDataBuilder partDataBuilder) {
return create(
new CacheBasedDatasetBuilder<>(ignite, upstreamCache),
partCtxBuilder,
partDataBuilder
);
}
/**
* Creates a new instance of distributed {@link SimpleDataset} using the specified {@code partCtxBuilder} and
* {@code featureExtractor}. This methods determines partition {@code data} to be {@link SimpleDatasetData}, but
* allows to use any desired type of partition {@code context}.
*
* @param datasetBuilder Dataset builder.
* @param partCtxBuilder Partition {@code context} builder.
* @param featureExtractor Feature extractor used to extract features and build {@link SimpleDatasetData}.
* @param Type of a key in {@code upstream} data.
* @param Type of a value in {@code upstream} data.
* @param Type of a partition {@code context}.
* @return Dataset.
*/
public static SimpleDataset createSimpleDataset(
DatasetBuilder datasetBuilder, PartitionContextBuilder partCtxBuilder,
IgniteBiFunction featureExtractor) {
return create(
datasetBuilder,
partCtxBuilder,
new SimpleDatasetDataBuilder<>(featureExtractor)
).wrap(SimpleDataset::new);
}
/**
* Creates a new instance of distributed {@link SimpleDataset} using the specified {@code partCtxBuilder} and
* {@code featureExtractor}. This methods determines partition {@code data} to be {@link SimpleDatasetData}, but
* allows to use any desired type of partition {@code context}.
*
* @param ignite Ignite instance.
* @param upstreamCache Ignite Cache with {@code upstream} data.
* @param partCtxBuilder Partition {@code context} builder.
* @param featureExtractor Feature extractor used to extract features and build {@link SimpleDatasetData}.
* @param Type of a key in {@code upstream} data.
* @param Type of a value in {@code upstream} data.
* @param Type of a partition {@code context}.
* @return Dataset.
*/
public static SimpleDataset createSimpleDataset(Ignite ignite,
IgniteCache upstreamCache, PartitionContextBuilder partCtxBuilder,
IgniteBiFunction featureExtractor) {
return createSimpleDataset(
new CacheBasedDatasetBuilder<>(ignite, upstreamCache),
partCtxBuilder,
featureExtractor
);
}
/**
* Creates a new instance of distributed {@link SimpleLabeledDataset} using the specified {@code partCtxBuilder},
* {@code featureExtractor} and {@code lbExtractor}. This method determines partition {@code data} to be
* {@link SimpleLabeledDatasetData}, but allows to use any desired type of partition {@code context}.
*
* @param datasetBuilder Dataset builder.
* @param partCtxBuilder Partition {@code context} builder.
* @param featureExtractor Feature extractor used to extract features and build {@link SimpleLabeledDatasetData}.
* @param lbExtractor Label extractor used to extract labels and buikd {@link SimpleLabeledDatasetData}.
* @param Type of a key in {@code upstream} data.
* @param Type of a value in {@code upstream} data.
* @param Type of a partition {@code context}.
* @return Dataset.
*/
public static SimpleLabeledDataset createSimpleLabeledDataset(
DatasetBuilder datasetBuilder, PartitionContextBuilder partCtxBuilder,
IgniteBiFunction featureExtractor, IgniteBiFunction lbExtractor) {
return create(
datasetBuilder,
partCtxBuilder,
new SimpleLabeledDatasetDataBuilder<>(featureExtractor, lbExtractor)
).wrap(SimpleLabeledDataset::new);
}
/**
* Creates a new instance of distributed {@link SimpleLabeledDataset} using the specified {@code partCtxBuilder},
* {@code featureExtractor} and {@code lbExtractor}. This method determines partition {@code data} to be
* {@link SimpleLabeledDatasetData}, but allows to use any desired type of partition {@code context}.
*
* @param ignite Ignite instance.
* @param upstreamCache Ignite Cache with {@code upstream} data.
* @param partCtxBuilder Partition {@code context} builder.
* @param featureExtractor Feature extractor used to extract features and build {@link SimpleLabeledDatasetData}.
* @param lbExtractor Label extractor used to extract labels and buikd {@link SimpleLabeledDatasetData}.
* @param Type of a key in {@code upstream} data.
* @param Type of a value in {@code upstream} data.
* @param Type of a partition {@code context}.
* @return Dataset.
*/
public static SimpleLabeledDataset createSimpleLabeledDataset(Ignite ignite,
IgniteCache upstreamCache, PartitionContextBuilder partCtxBuilder,
IgniteBiFunction featureExtractor, IgniteBiFunction lbExtractor) {
return createSimpleLabeledDataset(
new CacheBasedDatasetBuilder<>(ignite, upstreamCache),
partCtxBuilder,
featureExtractor,
lbExtractor
);
}
/**
* Creates a new instance of distributed {@link SimpleDataset} using the specified {@code featureExtractor}. This
* methods determines partition {@code context} to be {@link EmptyContext} and partition {@code data} to be
* {@link SimpleDatasetData}.
*
* @param datasetBuilder Dataset builder.
* @param featureExtractor Feature extractor used to extract features and build {@link SimpleDatasetData}.
* @param Type of a key in {@code upstream} data.
* @param Type of a value in {@code upstream} data.
* @return Dataset.
*/
public static SimpleDataset createSimpleDataset(DatasetBuilder datasetBuilder,
IgniteBiFunction featureExtractor) {
return createSimpleDataset(
datasetBuilder,
new EmptyContextBuilder<>(),
featureExtractor
);
}
/**
* Creates a new instance of distributed {@link SimpleDataset} using the specified {@code featureExtractor}. This
* methods determines partition {@code context} to be {@link EmptyContext} and partition {@code data} to be
* {@link SimpleDatasetData}.
*
* @param ignite Ignite instance.
* @param upstreamCache Ignite Cache with {@code upstream} data.
* @param featureExtractor Feature extractor used to extract features and build {@link SimpleDatasetData}.
* @param Type of a key in {@code upstream} data.
* @param Type of a value in {@code upstream} data.
* @return Dataset.
*/
public static SimpleDataset createSimpleDataset(Ignite ignite, IgniteCache upstreamCache,
IgniteBiFunction featureExtractor) {
return createSimpleDataset(
new CacheBasedDatasetBuilder<>(ignite, upstreamCache),
featureExtractor
);
}
/**
* Creates a new instance of distributed {@link SimpleLabeledDataset} using the specified {@code featureExtractor}
* and {@code lbExtractor}. This methods determines partition {@code context} to be {@link EmptyContext} and
* partition {@code data} to be {@link SimpleLabeledDatasetData}.
*
* @param datasetBuilder Dataset builder.
* @param featureExtractor Feature extractor used to extract features and build {@link SimpleLabeledDatasetData}.
* @param lbExtractor Label extractor used to extract labels and buikd {@link SimpleLabeledDatasetData}.
* @param Type of a key in {@code upstream} data.
* @param Type of a value in {@code upstream} data.
* @return Dataset.
*/
public static SimpleLabeledDataset createSimpleLabeledDataset(
DatasetBuilder datasetBuilder, IgniteBiFunction featureExtractor,
IgniteBiFunction lbExtractor) {
return createSimpleLabeledDataset(
datasetBuilder,
new EmptyContextBuilder<>(),
featureExtractor,
lbExtractor
);
}
/**
* Creates a new instance of distributed {@link SimpleLabeledDataset} using the specified {@code featureExtractor}
* and {@code lbExtractor}. This methods determines partition {@code context} to be {@link EmptyContext} and
* partition {@code data} to be {@link SimpleLabeledDatasetData}.
*
* @param ignite Ignite instance.
* @param upstreamCache Ignite Cache with {@code upstream} data.
* @param featureExtractor Feature extractor used to extract features and build {@link SimpleLabeledDatasetData}.
* @param lbExtractor Label extractor used to extract labels and buikd {@link SimpleLabeledDatasetData}.
* @param Type of a key in {@code upstream} data.
* @param Type of a value in {@code upstream} data.
* @return Dataset.
*/
public static SimpleLabeledDataset createSimpleLabeledDataset(Ignite ignite,
IgniteCache upstreamCache, IgniteBiFunction featureExtractor,
IgniteBiFunction lbExtractor) {
return createSimpleLabeledDataset(
new CacheBasedDatasetBuilder<>(ignite, upstreamCache),
featureExtractor,
lbExtractor
);
}
/**
* Creates a new instance of local dataset using the specified {@code partCtxBuilder} and {@code partDataBuilder}.
* This is the generic methods that allows to create any Ignite Cache based datasets with any desired partition
* {@code context} and {@code data}.
*
* @param upstreamMap {@code Map} with {@code upstream} data.
* @param partitions Number of partitions {@code upstream} {@code Map} will be divided on.
* @param partCtxBuilder Partition {@code context} builder.
* @param partDataBuilder Partition {@code data} builder.
* @param Type of a key in {@code upstream} data.
* @param Type of a value in {@code upstream} data.
* @param Type of a partition {@code context}.
* @param Type of a partition {@code data}.
* @return Dataset.
*/
public static Dataset create(
Map upstreamMap, int partitions, PartitionContextBuilder partCtxBuilder,
PartitionDataBuilder partDataBuilder) {
return create(
new LocalDatasetBuilder<>(upstreamMap, partitions),
partCtxBuilder,
partDataBuilder
);
}
/**
* Creates a new instance of local {@link SimpleDataset} using the specified {@code partCtxBuilder} and
* {@code featureExtractor}. This methods determines partition {@code data} to be {@link SimpleDatasetData}, but
* allows to use any desired type of partition {@code context}.
*
* @param upstreamMap {@code Map} with {@code upstream} data.
* @param partitions Number of partitions {@code upstream} {@code Map} will be divided on.
* @param partCtxBuilder Partition {@code context} builder.
* @param featureExtractor Feature extractor used to extract features and build {@link SimpleDatasetData}.
* @param Type of a key in {@code upstream} data.
* @param Type of a value in {@code upstream} data.
* @param Type of a partition {@code context}.
* @return Dataset.
*/
public static SimpleDataset createSimpleDataset(Map upstreamMap,
int partitions, PartitionContextBuilder partCtxBuilder,
IgniteBiFunction featureExtractor) {
return createSimpleDataset(
new LocalDatasetBuilder<>(upstreamMap, partitions),
partCtxBuilder,
featureExtractor
);
}
/**
* Creates a new instance of local {@link SimpleLabeledDataset} using the specified {@code partCtxBuilder},
* {@code featureExtractor} and {@code lbExtractor}. This method determines partition {@code data} to be
* {@link SimpleLabeledDatasetData}, but allows to use any desired type of partition {@code context}.
*
* @param upstreamMap {@code Map} with {@code upstream} data.
* @param partitions Number of partitions {@code upstream} {@code Map} will be divided on.
* @param partCtxBuilder Partition {@code context} builder.
* @param featureExtractor Feature extractor used to extract features and build {@link SimpleLabeledDatasetData}.
* @param lbExtractor Label extractor used to extract labels and buikd {@link SimpleLabeledDatasetData}.
* @param Type of a key in {@code upstream} data.
* @param Type of a value in {@code upstream} data.
* @param Type of a partition {@code context}.
* @return Dataset.
*/
public static SimpleLabeledDataset createSimpleLabeledDataset(
Map upstreamMap, int partitions, PartitionContextBuilder partCtxBuilder,
IgniteBiFunction featureExtractor, IgniteBiFunction lbExtractor) {
return createSimpleLabeledDataset(
new LocalDatasetBuilder<>(upstreamMap, partitions),
partCtxBuilder,
featureExtractor, lbExtractor
);
}
/**
* Creates a new instance of local {@link SimpleDataset} using the specified {@code featureExtractor}. This
* methods determines partition {@code context} to be {@link EmptyContext} and partition {@code data} to be
* {@link SimpleDatasetData}.
*
* @param upstreamMap {@code Map} with {@code upstream} data.
* @param partitions Number of partitions {@code upstream} {@code Map} will be divided on.
* @param featureExtractor Feature extractor used to extract features and build {@link SimpleDatasetData}.
* @param Type of a key in {@code upstream} data.
* @param Type of a value in {@code upstream} data.
* @return Dataset.
*/
public static SimpleDataset createSimpleDataset(Map upstreamMap, int partitions,
IgniteBiFunction featureExtractor) {
return createSimpleDataset(
new LocalDatasetBuilder<>(upstreamMap, partitions),
featureExtractor
);
}
/**
* Creates a new instance of local {@link SimpleLabeledDataset} using the specified {@code featureExtractor}
* and {@code lbExtractor}. This methods determines partition {@code context} to be {@link EmptyContext} and
* partition {@code data} to be {@link SimpleLabeledDatasetData}.
*
* @param upstreamMap {@code Map} with {@code upstream} data.
* @param partitions Number of partitions {@code upstream} {@code Map} will be divided on.
* @param featureExtractor Feature extractor used to extract features and build {@link SimpleLabeledDatasetData}.
* @param lbExtractor Label extractor used to extract labels and build {@link SimpleLabeledDatasetData}.
* @param Type of a key in {@code upstream} data.
* @param Type of a value in {@code upstream} data.
* @return Dataset.
*/
public static SimpleLabeledDataset createSimpleLabeledDataset(Map upstreamMap,
int partitions, IgniteBiFunction featureExtractor,
IgniteBiFunction lbExtractor) {
return createSimpleLabeledDataset(
new LocalDatasetBuilder<>(upstreamMap, partitions),
featureExtractor,
lbExtractor
);
}
}