All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.davidbracewell.apollo.ml.data.DatasetBuilder Maven / Gradle / Ivy

The newest version!
package com.davidbracewell.apollo.ml.data;

import com.davidbracewell.apollo.ml.*;
import com.davidbracewell.apollo.ml.data.source.DataSource;
import com.davidbracewell.apollo.ml.encoder.Encoder;
import com.davidbracewell.apollo.ml.encoder.IndexEncoder;
import com.davidbracewell.apollo.ml.encoder.LabelEncoder;
import com.davidbracewell.apollo.ml.preprocess.PreprocessorList;
import com.davidbracewell.guava.common.base.Throwables;
import com.davidbracewell.io.resource.Resource;
import com.davidbracewell.stream.MStream;
import com.davidbracewell.stream.StreamingContext;
import lombok.NonNull;

import java.io.IOException;
import java.util.Collection;
import java.util.stream.Stream;

/**
 * 

Builder for {@link Dataset}

* * @param the example type parameter * @author David B. Bracewell */ public class DatasetBuilder { private final LabelEncoder labelEncoder; private final Class exampleType; private DataSource dataSource; private DatasetType type = DatasetType.InMemory; private Encoder featureEncoder = new IndexEncoder(); private MStream source; private Resource load; /** * Instantiates a new Dataset builder. * * @param labelEncoder the label encoder * @param exampleType the example type */ protected DatasetBuilder(@NonNull LabelEncoder labelEncoder, @NonNull Class exampleType) { this.labelEncoder = labelEncoder; this.exampleType = exampleType; } private Dataset createDataset() { switch (type) { case Distributed: return new DistributedDataset<>(featureEncoder, labelEncoder, PreprocessorList.empty()); case OffHeap: return new OffHeapDataset<>(featureEncoder, labelEncoder, PreprocessorList.empty()); default: return new InMemoryDataset<>(featureEncoder, labelEncoder, PreprocessorList.empty()); } } /** * Sets the feature encoder to use. * * @param featureEncoder the feature encoder */ public DatasetBuilder featureEncoder(@NonNull Encoder featureEncoder) { this.featureEncoder = featureEncoder; return this; } /** * Sets the feature encoder to use. * * @param datasetFile the feature encoder */ public Dataset load(@NonNull Resource datasetFile) { try { return createDataset().read(datasetFile, exampleType); } catch (IOException e) { throw Throwables.propagate(e); } } /** * Sets the streaming source from a Java Stream. * * @param stream the stream * @return the dataset builder */ public Dataset source(@NonNull Stream stream) { Dataset dataset = createDataset(); dataset.addAll(StreamingContext.local().stream(stream)); return dataset; } /** * Sets the streaming source from a collection of examples. * * @param collection the collection of examples * @return the dataset builder */ public Dataset source(@NonNull Collection collection) { Dataset dataset = createDataset(); dataset.addAll(StreamingContext.local().stream(collection)); return dataset; } /** * Sets the examples to be read in from the given data source. * * @param dataSource the data source * @return the dataset builder */ public Dataset source(@NonNull DataSource dataSource) { Dataset dataset = createDataset(); dataSource.setStreamingContext(type.getStreamingContext()); try { dataset.addAll(dataSource.stream()); } catch (IOException e) { throw Throwables.propagate(e); } return dataset; } /** * Sets the streaming source from a Mango Stream. * * @param stream the stream * @return the dataset builder */ public Dataset source(@NonNull MStream stream) { Dataset dataset = createDataset(); dataset.addAll(stream); return dataset; } /** * Sets the type (In-Memory, Distributed, or Off Heap) of the dataset * * @param type the type * @return the dataset builder */ public DatasetBuilder type(@NonNull DatasetType type) { this.type = type; return this; } }// END OF DatasetBuilder




© 2015 - 2025 Weber Informatics LLC | Privacy Policy