All Downloads are FREE. Search and download functionalities are using the official Maven repository.

software.amazon.awssdk.services.machinelearning.model.RedshiftDataSpec Maven / Gradle / Ivy

/*
 * Copyright 2014-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with
 * the License. A copy of the License is located at
 * 
 * http://aws.amazon.com/apache2.0
 * 
 * or in the "license" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions
 * and limitations under the License.
 */

package software.amazon.awssdk.services.machinelearning.model;

import java.io.Serializable;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.function.BiConsumer;
import java.util.function.Consumer;
import java.util.function.Function;
import software.amazon.awssdk.annotations.Generated;
import software.amazon.awssdk.core.SdkField;
import software.amazon.awssdk.core.SdkPojo;
import software.amazon.awssdk.core.protocol.MarshallLocation;
import software.amazon.awssdk.core.protocol.MarshallingType;
import software.amazon.awssdk.core.traits.LocationTrait;
import software.amazon.awssdk.utils.ToString;
import software.amazon.awssdk.utils.builder.CopyableBuilder;
import software.amazon.awssdk.utils.builder.ToCopyableBuilder;

/**
 * Describes the data specification of an Amazon Redshift DataSource.
 */
@Generated("software.amazon.awssdk:codegen")
public final class RedshiftDataSpec implements SdkPojo, Serializable,
        ToCopyableBuilder<RedshiftDataSpec.Builder, RedshiftDataSpec> {
    // One SdkField descriptor per member. Each binds the getter on this class and the setter on Builder to the
    // PAYLOAD location name given in its LocationTrait. The explicit type witnesses and type arguments were lost
    // when the page was scraped (leaving e.g. "List>"), and are restored here.
    private static final SdkField<RedshiftDatabase> DATABASE_INFORMATION_FIELD = SdkField
            .<RedshiftDatabase> builder(MarshallingType.SDK_POJO)
            .getter(getter(RedshiftDataSpec::databaseInformation))
            .setter(setter(Builder::databaseInformation))
            .constructor(RedshiftDatabase::builder)
            .traits(LocationTrait.builder().location(MarshallLocation.PAYLOAD).locationName("DatabaseInformation").build())
            .build();

    private static final SdkField<String> SELECT_SQL_QUERY_FIELD = SdkField
            .<String> builder(MarshallingType.STRING)
            .getter(getter(RedshiftDataSpec::selectSqlQuery))
            .setter(setter(Builder::selectSqlQuery))
            .traits(LocationTrait.builder().location(MarshallLocation.PAYLOAD).locationName("SelectSqlQuery").build())
            .build();

    private static final SdkField<RedshiftDatabaseCredentials> DATABASE_CREDENTIALS_FIELD = SdkField
            .<RedshiftDatabaseCredentials> builder(MarshallingType.SDK_POJO)
            .getter(getter(RedshiftDataSpec::databaseCredentials))
            .setter(setter(Builder::databaseCredentials))
            .constructor(RedshiftDatabaseCredentials::builder)
            .traits(LocationTrait.builder().location(MarshallLocation.PAYLOAD).locationName("DatabaseCredentials").build())
            .build();

    private static final SdkField<String> S3_STAGING_LOCATION_FIELD = SdkField
            .<String> builder(MarshallingType.STRING)
            .getter(getter(RedshiftDataSpec::s3StagingLocation))
            .setter(setter(Builder::s3StagingLocation))
            .traits(LocationTrait.builder().location(MarshallLocation.PAYLOAD).locationName("S3StagingLocation").build())
            .build();

    private static final SdkField<String> DATA_REARRANGEMENT_FIELD = SdkField
            .<String> builder(MarshallingType.STRING)
            .getter(getter(RedshiftDataSpec::dataRearrangement))
            .setter(setter(Builder::dataRearrangement))
            .traits(LocationTrait.builder().location(MarshallLocation.PAYLOAD).locationName("DataRearrangement").build())
            .build();

    private static final SdkField<String> DATA_SCHEMA_FIELD = SdkField
            .<String> builder(MarshallingType.STRING)
            .getter(getter(RedshiftDataSpec::dataSchema))
            .setter(setter(Builder::dataSchema))
            .traits(LocationTrait.builder().location(MarshallLocation.PAYLOAD).locationName("DataSchema").build())
            .build();

    private static final SdkField<String> DATA_SCHEMA_URI_FIELD = SdkField
            .<String> builder(MarshallingType.STRING)
            .getter(getter(RedshiftDataSpec::dataSchemaUri))
            .setter(setter(Builder::dataSchemaUri))
            .traits(LocationTrait.builder().location(MarshallLocation.PAYLOAD).locationName("DataSchemaUri").build())
            .build();

    // Unmodifiable list of every field descriptor above, in declaration order.
    private static final List<SdkField<?>> SDK_FIELDS = Collections.unmodifiableList(Arrays.asList(
            DATABASE_INFORMATION_FIELD, SELECT_SQL_QUERY_FIELD, DATABASE_CREDENTIALS_FIELD, S3_STAGING_LOCATION_FIELD,
            DATA_REARRANGEMENT_FIELD, DATA_SCHEMA_FIELD, DATA_SCHEMA_URI_FIELD));

    private static final long serialVersionUID = 1L;

    private final RedshiftDatabase databaseInformation;

    private final String selectSqlQuery;

    private final RedshiftDatabaseCredentials databaseCredentials;

    private final String s3StagingLocation;

    private final String dataRearrangement;

    private final String dataSchema;

    private final String dataSchemaUri;

    /**
     * Copies each of the seven members from the supplied builder into the corresponding final field.
     *
     * @param builder
     *        the builder whose current member values are copied
     */
    private RedshiftDataSpec(BuilderImpl builder) {
        this.databaseInformation = builder.databaseInformation;
        this.selectSqlQuery = builder.selectSqlQuery;
        this.databaseCredentials = builder.databaseCredentials;
        this.s3StagingLocation = builder.s3StagingLocation;
        this.dataRearrangement = builder.dataRearrangement;
        this.dataSchema = builder.dataSchema;
        this.dataSchemaUri = builder.dataSchemaUri;
    }

    /**
     *

* Describes the DatabaseName and ClusterIdentifier for an Amazon Redshift * DataSource. *

*
     * @return Describes the DatabaseName and ClusterIdentifier for an Amazon Redshift
     *         DataSource.
     */
    public RedshiftDatabase databaseInformation() {
        return this.databaseInformation;
    }

    /**
     *

* Describes the SQL Query to execute on an Amazon Redshift database for an Amazon Redshift DataSource. *

*
     * @return Describes the SQL Query to execute on an Amazon Redshift database for an Amazon Redshift
     *         DataSource.
     */
    public String selectSqlQuery() {
        return this.selectSqlQuery;
    }

    /**
     *

* Describes AWS Identity and Access Management (IAM) credentials that are used to connect to the Amazon Redshift * database. *

*
     * @return Describes AWS Identity and Access Management (IAM) credentials that are used to connect to the Amazon
     *         Redshift database.
     */
    public RedshiftDatabaseCredentials databaseCredentials() {
        return this.databaseCredentials;
    }

    /**
     *

* Describes an Amazon S3 location to store the result set of the SelectSqlQuery query. *

*
     * @return Describes an Amazon S3 location to store the result set of the SelectSqlQuery query.
     */
    public String s3StagingLocation() {
        return this.s3StagingLocation;
    }

    /**
     *

* A JSON string that represents the splitting and rearrangement processing to be applied to a * DataSource. If the DataRearrangement parameter is not provided, all of the input data * is used to create the Datasource. *

*

* There are multiple parameters that control what data is used to create a datasource: *

*
    *
  • *

    * percentBegin *

    *

    * Use percentBegin to indicate the beginning of the range of the data used to create the Datasource. * If you do not include percentBegin and percentEnd, Amazon ML includes all of the data * when creating the datasource. *

    *
  • *
  • *

    * percentEnd *

    *

    * Use percentEnd to indicate the end of the range of the data used to create the Datasource. If you do * not include percentBegin and percentEnd, Amazon ML includes all of the data when * creating the datasource. *

    *
  • *
  • *

    * complement *

    *

    * The complement parameter instructs Amazon ML to use the data that is not included in the range of * percentBegin to percentEnd to create a datasource. The complement * parameter is useful if you need to create complementary datasources for training and evaluation. To create a * complementary datasource, use the same values for percentBegin and percentEnd, along * with the complement parameter. *

    *

    * For example, the following two datasources do not share any data, and can be used to train and evaluate a model. * The first datasource has 25 percent of the data, and the second one has 75 percent of the data. *

    *

    * Datasource for evaluation: {"splitting":{"percentBegin":0, "percentEnd":25}} *

    *

    * Datasource for training: {"splitting":{"percentBegin":0, "percentEnd":25, "complement":"true"}} *

    *
  • *
  • *

    * strategy *

    *

    * To change how Amazon ML splits the data for a datasource, use the strategy parameter. *

    *

    * The default value for the strategy parameter is sequential, meaning that Amazon ML * takes all of the data records between the percentBegin and percentEnd parameters for * the datasource, in the order that the records appear in the input data. *

    *

    * The following two DataRearrangement lines are examples of sequentially ordered training and * evaluation datasources: *

    *

    * Datasource for evaluation: * {"splitting":{"percentBegin":70, "percentEnd":100, "strategy":"sequential"}} *

    *

    * Datasource for training: * {"splitting":{"percentBegin":70, "percentEnd":100, "strategy":"sequential", "complement":"true"}} *

    *

    * To randomly split the input data into the proportions indicated by the percentBegin and percentEnd parameters, * set the strategy parameter to random and provide a string that is used as the seed * value for the random data splitting (for example, you can use the S3 path to your data as the random seed * string). If you choose the random split strategy, Amazon ML assigns each row of data a pseudo-random number * between 0 and 100, and then selects the rows that have an assigned number between percentBegin and * percentEnd. Pseudo-random numbers are assigned using both the input seed string value and the byte * offset as a seed, so changing the data results in a different split. Any existing ordering is preserved. The * random splitting strategy ensures that variables in the training and evaluation data are distributed similarly. * It is useful in the cases where the input data may have an implicit sort order, which would otherwise result in * training and evaluation datasources containing non-similar data records. *

    *

    * The following two DataRearrangement lines are examples of non-sequentially ordered training and * evaluation datasources: *

    *

    * Datasource for evaluation: * {"splitting":{"percentBegin":70, "percentEnd":100, "strategy":"random", "randomSeed":"s3://my_s3_path/bucket/file.csv"}} *

    *

    * Datasource for training: * {"splitting":{"percentBegin":70, "percentEnd":100, "strategy":"random", "randomSeed":"s3://my_s3_path/bucket/file.csv", "complement":"true"}} *

    *
  • *
* * @return A JSON string that represents the splitting and rearrangement processing to be applied to a * DataSource. If the DataRearrangement parameter is not provided, all of the * input data is used to create the Datasource.

*

* There are multiple parameters that control what data is used to create a datasource: *

*
    *
  • *

    * percentBegin *

    *

    * Use percentBegin to indicate the beginning of the range of the data used to create the * Datasource. If you do not include percentBegin and percentEnd, Amazon ML * includes all of the data when creating the datasource. *

    *
  • *
  • *

    * percentEnd *

    *

    * Use percentEnd to indicate the end of the range of the data used to create the Datasource. * If you do not include percentBegin and percentEnd, Amazon ML includes all of * the data when creating the datasource. *

    *
  • *
  • *

    * complement *

    *

    * The complement parameter instructs Amazon ML to use the data that is not included in the * range of percentBegin to percentEnd to create a datasource. The * complement parameter is useful if you need to create complementary datasources for training * and evaluation. To create a complementary datasource, use the same values for percentBegin * and percentEnd, along with the complement parameter. *

    *

    * For example, the following two datasources do not share any data, and can be used to train and evaluate a * model. The first datasource has 25 percent of the data, and the second one has 75 percent of the data. *

    *

    * Datasource for evaluation: {"splitting":{"percentBegin":0, "percentEnd":25}} *

    *

    * Datasource for training: * {"splitting":{"percentBegin":0, "percentEnd":25, "complement":"true"}} *

    *
  • *
  • *

    * strategy *

    *

    * To change how Amazon ML splits the data for a datasource, use the strategy parameter. *

    *

    * The default value for the strategy parameter is sequential, meaning that Amazon * ML takes all of the data records between the percentBegin and percentEnd * parameters for the datasource, in the order that the records appear in the input data. *

    *

    * The following two DataRearrangement lines are examples of sequentially ordered training and * evaluation datasources: *

    *

    * Datasource for evaluation: * {"splitting":{"percentBegin":70, "percentEnd":100, "strategy":"sequential"}} *

    *

    * Datasource for training: * {"splitting":{"percentBegin":70, "percentEnd":100, "strategy":"sequential", "complement":"true"}} *

    *

    * To randomly split the input data into the proportions indicated by the percentBegin and percentEnd * parameters, set the strategy parameter to random and provide a string that is * used as the seed value for the random data splitting (for example, you can use the S3 path to your data * as the random seed string). If you choose the random split strategy, Amazon ML assigns each row of data a * pseudo-random number between 0 and 100, and then selects the rows that have an assigned number between * percentBegin and percentEnd. Pseudo-random numbers are assigned using both the * input seed string value and the byte offset as a seed, so changing the data results in a different split. * Any existing ordering is preserved. The random splitting strategy ensures that variables in the training * and evaluation data are distributed similarly. It is useful in the cases where the input data may have an * implicit sort order, which would otherwise result in training and evaluation datasources containing * non-similar data records. *

    *

    * The following two DataRearrangement lines are examples of non-sequentially ordered training * and evaluation datasources: *

    *

    * Datasource for evaluation: * {"splitting":{"percentBegin":70, "percentEnd":100, "strategy":"random", "randomSeed":"s3://my_s3_path/bucket/file.csv"}} *

    *

    * Datasource for training: * {"splitting":{"percentBegin":70, "percentEnd":100, "strategy":"random", "randomSeed":"s3://my_s3_path/bucket/file.csv", "complement":"true"}} *

    *
  • */
    public String dataRearrangement() {
        return this.dataRearrangement;
    }

    /**
     *

    * A JSON string that represents the schema for an Amazon Redshift DataSource. The * DataSchema defines the structure of the observation data in the data file(s) referenced in the * DataSource. *

    *

    * A DataSchema is not required if you specify a DataSchemaUri. *

    *

    * Define your DataSchema as a series of key-value pairs. attributes and * excludedVariableNames have an array of key-value pairs for their value. Use the following format to * define your DataSchema. *

    *

    * { "version": "1.0", *

    *

    * "recordAnnotationFieldName": "F1", *

    *

    * "recordWeightFieldName": "F2", *

    *

    * "targetFieldName": "F3", *

    *

    * "dataFormat": "CSV", *

    *

    * "dataFileContainsHeader": true, *

    *

    * "attributes": [ *

    *

    * { "fieldName": "F1", "fieldType": "TEXT" }, { "fieldName": "F2", "fieldType": "NUMERIC" }, { "fieldName": "F3", * "fieldType": "CATEGORICAL" }, { "fieldName": "F4", "fieldType": "NUMERIC" }, { "fieldName": "F5", "fieldType": * "CATEGORICAL" }, { "fieldName": "F6", "fieldType": "TEXT" }, { "fieldName": "F7", "fieldType": * "WEIGHTED_INT_SEQUENCE" }, { "fieldName": "F8", "fieldType": "WEIGHTED_STRING_SEQUENCE" } ], *

    *

    * "excludedVariableNames": [ "F6" ] } *

    * * @return A JSON string that represents the schema for an Amazon Redshift DataSource. The * DataSchema defines the structure of the observation data in the data file(s) referenced in * the DataSource.

    *

    * A DataSchema is not required if you specify a DataSchemaUri. *

    *

    * Define your DataSchema as a series of key-value pairs. attributes and * excludedVariableNames have an array of key-value pairs for their value. Use the following * format to define your DataSchema. *

    *

    * { "version": "1.0", *

    *

    * "recordAnnotationFieldName": "F1", *

    *

    * "recordWeightFieldName": "F2", *

    *

    * "targetFieldName": "F3", *

    *

    * "dataFormat": "CSV", *

    *

    * "dataFileContainsHeader": true, *

    *

    * "attributes": [ *

    *

    * { "fieldName": "F1", "fieldType": "TEXT" }, { "fieldName": "F2", "fieldType": "NUMERIC" }, { "fieldName": * "F3", "fieldType": "CATEGORICAL" }, { "fieldName": "F4", "fieldType": "NUMERIC" }, { "fieldName": "F5", * "fieldType": "CATEGORICAL" }, { "fieldName": "F6", "fieldType": "TEXT" }, { "fieldName": "F7", * "fieldType": "WEIGHTED_INT_SEQUENCE" }, { "fieldName": "F8", "fieldType": "WEIGHTED_STRING_SEQUENCE" } ], *

    *

    * "excludedVariableNames": [ "F6" ] }
     */
    public String dataSchema() {
        return this.dataSchema;
    }

    /**
     *

    * Describes the schema location for an Amazon Redshift DataSource. *

    * * @return Describes the schema location for an Amazon Redshift DataSource. */ public String dataSchemaUri() { return dataSchemaUri; } @Override public Builder toBuilder() { return new BuilderImpl(this); } public static Builder builder() { return new BuilderImpl(); } public static Class serializableBuilderClass() { return BuilderImpl.class; } @Override public int hashCode() { int hashCode = 1; hashCode = 31 * hashCode + Objects.hashCode(databaseInformation()); hashCode = 31 * hashCode + Objects.hashCode(selectSqlQuery()); hashCode = 31 * hashCode + Objects.hashCode(databaseCredentials()); hashCode = 31 * hashCode + Objects.hashCode(s3StagingLocation()); hashCode = 31 * hashCode + Objects.hashCode(dataRearrangement()); hashCode = 31 * hashCode + Objects.hashCode(dataSchema()); hashCode = 31 * hashCode + Objects.hashCode(dataSchemaUri()); return hashCode; } @Override public boolean equals(Object obj) { return equalsBySdkFields(obj); } @Override public boolean equalsBySdkFields(Object obj) { if (this == obj) { return true; } if (obj == null) { return false; } if (!(obj instanceof RedshiftDataSpec)) { return false; } RedshiftDataSpec other = (RedshiftDataSpec) obj; return Objects.equals(databaseInformation(), other.databaseInformation()) && Objects.equals(selectSqlQuery(), other.selectSqlQuery()) && Objects.equals(databaseCredentials(), other.databaseCredentials()) && Objects.equals(s3StagingLocation(), other.s3StagingLocation()) && Objects.equals(dataRearrangement(), other.dataRearrangement()) && Objects.equals(dataSchema(), other.dataSchema()) && Objects.equals(dataSchemaUri(), other.dataSchemaUri()); } /** * Returns a string representation of this object. This is useful for testing and debugging. Sensitive data will be * redacted from this string using a placeholder value. 
*/ @Override public String toString() { return ToString.builder("RedshiftDataSpec").add("DatabaseInformation", databaseInformation()) .add("SelectSqlQuery", selectSqlQuery()).add("DatabaseCredentials", databaseCredentials()) .add("S3StagingLocation", s3StagingLocation()).add("DataRearrangement", dataRearrangement()) .add("DataSchema", dataSchema()).add("DataSchemaUri", dataSchemaUri()).build(); } public Optional getValueForField(String fieldName, Class clazz) { switch (fieldName) { case "DatabaseInformation": return Optional.ofNullable(clazz.cast(databaseInformation())); case "SelectSqlQuery": return Optional.ofNullable(clazz.cast(selectSqlQuery())); case "DatabaseCredentials": return Optional.ofNullable(clazz.cast(databaseCredentials())); case "S3StagingLocation": return Optional.ofNullable(clazz.cast(s3StagingLocation())); case "DataRearrangement": return Optional.ofNullable(clazz.cast(dataRearrangement())); case "DataSchema": return Optional.ofNullable(clazz.cast(dataSchema())); case "DataSchemaUri": return Optional.ofNullable(clazz.cast(dataSchemaUri())); default: return Optional.empty(); } } @Override public List> sdkFields() { return SDK_FIELDS; } private static Function getter(Function g) { return obj -> g.apply((RedshiftDataSpec) obj); } private static BiConsumer setter(BiConsumer s) { return (obj, val) -> s.accept((Builder) obj, val); } public interface Builder extends SdkPojo, CopyableBuilder { /** *

    * Describes the DatabaseName and ClusterIdentifier for an Amazon Redshift * DataSource. *

    *
         * @param databaseInformation
         *        Describes the DatabaseName and ClusterIdentifier for an Amazon Redshift
         *        DataSource.
         * @return Returns a reference to this object so that method calls can be chained together.
         */
        Builder databaseInformation(RedshiftDatabase databaseInformation);

        /**
         *

    * Describes the DatabaseName and ClusterIdentifier for an Amazon Redshift * DataSource. *

    * This is a convenience that creates an instance of the {@link RedshiftDatabase.Builder} avoiding the need to * create one manually via {@link RedshiftDatabase#builder()}. * * When the {@link Consumer} completes, {@link RedshiftDatabase.Builder#build()} is called immediately and its * result is passed to {@link #databaseInformation(RedshiftDatabase)}. * * @param databaseInformation * a consumer that will call methods on {@link RedshiftDatabase.Builder} * @return Returns a reference to this object so that method calls can be chained together. * @see #databaseInformation(RedshiftDatabase) */ default Builder databaseInformation(Consumer databaseInformation) { return databaseInformation(RedshiftDatabase.builder().applyMutation(databaseInformation).build()); } /** *

    * Describes the SQL Query to execute on an Amazon Redshift database for an Amazon Redshift * DataSource. *

    *
         * @param selectSqlQuery
         *        Describes the SQL Query to execute on an Amazon Redshift database for an Amazon Redshift
         *        DataSource.
         * @return Returns a reference to this object so that method calls can be chained together.
         */
        Builder selectSqlQuery(String selectSqlQuery);

        /**
         *

    * Describes AWS Identity and Access Management (IAM) credentials that are used to connect to the Amazon Redshift * database. *

    *
         * @param databaseCredentials
         *        Describes AWS Identity and Access Management (IAM) credentials that are used to connect to the Amazon
         *        Redshift database.
         * @return Returns a reference to this object so that method calls can be chained together.
         */
        Builder databaseCredentials(RedshiftDatabaseCredentials databaseCredentials);

        /**
         *

    * Describes AWS Identity and Access Management (IAM) credentials that are used to connect to the Amazon Redshift * database. *

    * This is a convenience that creates an instance of the {@link RedshiftDatabaseCredentials.Builder} avoiding * the need to create one manually via {@link RedshiftDatabaseCredentials#builder()}. * * When the {@link Consumer} completes, {@link RedshiftDatabaseCredentials.Builder#build()} is called * immediately and its result is passed to {@link #databaseCredentials(RedshiftDatabaseCredentials)}. * * @param databaseCredentials * a consumer that will call methods on {@link RedshiftDatabaseCredentials.Builder} * @return Returns a reference to this object so that method calls can be chained together. * @see #databaseCredentials(RedshiftDatabaseCredentials) */ default Builder databaseCredentials(Consumer databaseCredentials) { return databaseCredentials(RedshiftDatabaseCredentials.builder().applyMutation(databaseCredentials).build()); } /** *

    * Describes an Amazon S3 location to store the result set of the SelectSqlQuery query. *

    *
         * @param s3StagingLocation
         *        Describes an Amazon S3 location to store the result set of the SelectSqlQuery query.
         * @return Returns a reference to this object so that method calls can be chained together.
         */
        Builder s3StagingLocation(String s3StagingLocation);

        /**
         *

    * A JSON string that represents the splitting and rearrangement processing to be applied to a * DataSource. If the DataRearrangement parameter is not provided, all of the input * data is used to create the Datasource. *

    *

    * There are multiple parameters that control what data is used to create a datasource: *

    *
      *
    • *

      * percentBegin *

      *

      * Use percentBegin to indicate the beginning of the range of the data used to create the * Datasource. If you do not include percentBegin and percentEnd, Amazon ML includes * all of the data when creating the datasource. *

      *
    • *
    • *

      * percentEnd *

      *

      * Use percentEnd to indicate the end of the range of the data used to create the Datasource. If * you do not include percentBegin and percentEnd, Amazon ML includes all of the data * when creating the datasource. *

      *
    • *
    • *

      * complement *

      *

      * The complement parameter instructs Amazon ML to use the data that is not included in the range * of percentBegin to percentEnd to create a datasource. The complement * parameter is useful if you need to create complementary datasources for training and evaluation. To create a * complementary datasource, use the same values for percentBegin and percentEnd, * along with the complement parameter. *

      *

      * For example, the following two datasources do not share any data, and can be used to train and evaluate a * model. The first datasource has 25 percent of the data, and the second one has 75 percent of the data. *

      *

      * Datasource for evaluation: {"splitting":{"percentBegin":0, "percentEnd":25}} *

      *

      * Datasource for training: {"splitting":{"percentBegin":0, "percentEnd":25, "complement":"true"}} *

      *
    • *
    • *

      * strategy *

      *

      * To change how Amazon ML splits the data for a datasource, use the strategy parameter. *

      *

      * The default value for the strategy parameter is sequential, meaning that Amazon ML * takes all of the data records between the percentBegin and percentEnd parameters * for the datasource, in the order that the records appear in the input data. *

      *

      * The following two DataRearrangement lines are examples of sequentially ordered training and * evaluation datasources: *

      *

      * Datasource for evaluation: * {"splitting":{"percentBegin":70, "percentEnd":100, "strategy":"sequential"}} *

      *

      * Datasource for training: * {"splitting":{"percentBegin":70, "percentEnd":100, "strategy":"sequential", "complement":"true"}} *

      *

      * To randomly split the input data into the proportions indicated by the percentBegin and percentEnd * parameters, set the strategy parameter to random and provide a string that is used * as the seed value for the random data splitting (for example, you can use the S3 path to your data as the * random seed string). If you choose the random split strategy, Amazon ML assigns each row of data a * pseudo-random number between 0 and 100, and then selects the rows that have an assigned number between * percentBegin and percentEnd. Pseudo-random numbers are assigned using both the * input seed string value and the byte offset as a seed, so changing the data results in a different split. Any * existing ordering is preserved. The random splitting strategy ensures that variables in the training and * evaluation data are distributed similarly. It is useful in the cases where the input data may have an * implicit sort order, which would otherwise result in training and evaluation datasources containing * non-similar data records. *

      *

      * The following two DataRearrangement lines are examples of non-sequentially ordered training and * evaluation datasources: *

      *

      * Datasource for evaluation: * {"splitting":{"percentBegin":70, "percentEnd":100, "strategy":"random", "randomSeed":"s3://my_s3_path/bucket/file.csv"}} *

      *

      * Datasource for training: * {"splitting":{"percentBegin":70, "percentEnd":100, "strategy":"random", "randomSeed":"s3://my_s3_path/bucket/file.csv", "complement":"true"}} *

      *
    • *
    * * @param dataRearrangement * A JSON string that represents the splitting and rearrangement processing to be applied to a * DataSource. If the DataRearrangement parameter is not provided, all of the * input data is used to create the Datasource.

    *

    * There are multiple parameters that control what data is used to create a datasource: *

    *
      *
    • *

      * percentBegin *

      *

      * Use percentBegin to indicate the beginning of the range of the data used to create the * Datasource. If you do not include percentBegin and percentEnd, Amazon ML * includes all of the data when creating the datasource. *

      *
    • *
    • *

      * percentEnd *

      *

      * Use percentEnd to indicate the end of the range of the data used to create the * Datasource. If you do not include percentBegin and percentEnd, Amazon ML * includes all of the data when creating the datasource. *

      *
    • *
    • *

      * complement *

      *

      * The complement parameter instructs Amazon ML to use the data that is not included in the * range of percentBegin to percentEnd to create a datasource. The * complement parameter is useful if you need to create complementary datasources for * training and evaluation. To create a complementary datasource, use the same values for * percentBegin and percentEnd, along with the complement * parameter. *

      *

      * For example, the following two datasources do not share any data, and can be used to train and * evaluate a model. The first datasource has 25 percent of the data, and the second one has 75 percent * of the data. *

      *

      * Datasource for evaluation: {"splitting":{"percentBegin":0, "percentEnd":25}} *

      *

      * Datasource for training: * {"splitting":{"percentBegin":0, "percentEnd":25, "complement":"true"}} *

      *
    • *
    • *

      * strategy *

      *

      * To change how Amazon ML splits the data for a datasource, use the strategy parameter. *

      *

      * The default value for the strategy parameter is sequential, meaning that * Amazon ML takes all of the data records between the percentBegin and * percentEnd parameters for the datasource, in the order that the records appear in the * input data. *

      *

      * The following two DataRearrangement lines are examples of sequentially ordered training * and evaluation datasources: *

      *

      * Datasource for evaluation: * {"splitting":{"percentBegin":70, "percentEnd":100, "strategy":"sequential"}} *

      *

      * Datasource for training: * {"splitting":{"percentBegin":70, "percentEnd":100, "strategy":"sequential", "complement":"true"}} *

      *

      * To randomly split the input data into the proportions indicated by the percentBegin and percentEnd * parameters, set the strategy parameter to random and provide a string that * is used as the seed value for the random data splitting (for example, you can use the S3 path to your * data as the random seed string). If you choose the random split strategy, Amazon ML assigns each row * of data a pseudo-random number between 0 and 100, and then selects the rows that have an assigned * number between percentBegin and percentEnd. Pseudo-random numbers are * assigned using both the input seed string value and the byte offset as a seed, so changing the data * results in a different split. Any existing ordering is preserved. The random splitting strategy * ensures that variables in the training and evaluation data are distributed similarly. It is useful in * the cases where the input data may have an implicit sort order, which would otherwise result in * training and evaluation datasources containing non-similar data records. *

      *

      * The following two DataRearrangement lines are examples of non-sequentially ordered * training and evaluation datasources: *

      *

      * Datasource for evaluation: * {"splitting":{"percentBegin":70, "percentEnd":100, "strategy":"random", "randomSeed":"s3://my_s3_path/bucket/file.csv"}} *

      *

      * Datasource for training: * {"splitting":{"percentBegin":70, "percentEnd":100, "strategy":"random", "randomSeed":"s3://my_s3_path/bucket/file.csv", "complement":"true"}} *

      *
    • * @return Returns a reference to this object so that method calls can be chained together. */ Builder dataRearrangement(String dataRearrangement); /** *

      * A JSON string that represents the schema for an Amazon Redshift DataSource. The * DataSchema defines the structure of the observation data in the data file(s) referenced in the * DataSource. *

      *

      * A DataSchema is not required if you specify a DataSchemaUri. *

      *

      * Define your DataSchema as a series of key-value pairs. attributes and * excludedVariableNames have an array of key-value pairs for their value. Use the following format * to define your DataSchema. *

      *

      * { "version": "1.0", *

      *

      * "recordAnnotationFieldName": "F1", *

      *

      * "recordWeightFieldName": "F2", *

      *

      * "targetFieldName": "F3", *

      *

      * "dataFormat": "CSV", *

      *

      * "dataFileContainsHeader": true, *

      *

      * "attributes": [ *

      *

      * { "fieldName": "F1", "fieldType": "TEXT" }, { "fieldName": "F2", "fieldType": "NUMERIC" }, { "fieldName": * "F3", "fieldType": "CATEGORICAL" }, { "fieldName": "F4", "fieldType": "NUMERIC" }, { "fieldName": "F5", * "fieldType": "CATEGORICAL" }, { "fieldName": "F6", "fieldType": "TEXT" }, { "fieldName": "F7", "fieldType": * "WEIGHTED_INT_SEQUENCE" }, { "fieldName": "F8", "fieldType": "WEIGHTED_STRING_SEQUENCE" } ], *

      *

      * "excludedVariableNames": [ "F6" ] } *

      * * @param dataSchema * A JSON string that represents the schema for an Amazon Redshift DataSource. The * DataSchema defines the structure of the observation data in the data file(s) referenced * in the DataSource.

      *

      * A DataSchema is not required if you specify a DataSchemaUri. *

      *

      * Define your DataSchema as a series of key-value pairs. attributes and * excludedVariableNames have an array of key-value pairs for their value. Use the following * format to define your DataSchema. *

      *

      * { "version": "1.0", *

      *

      * "recordAnnotationFieldName": "F1", *

      *

      * "recordWeightFieldName": "F2", *

      *

      * "targetFieldName": "F3", *

      *

      * "dataFormat": "CSV", *

      *

      * "dataFileContainsHeader": true, *

      *

      * "attributes": [ *

      *

      * { "fieldName": "F1", "fieldType": "TEXT" }, { "fieldName": "F2", "fieldType": "NUMERIC" }, { * "fieldName": "F3", "fieldType": "CATEGORICAL" }, { "fieldName": "F4", "fieldType": "NUMERIC" }, { * "fieldName": "F5", "fieldType": "CATEGORICAL" }, { "fieldName": "F6", "fieldType": "TEXT" }, { * "fieldName": "F7", "fieldType": "WEIGHTED_INT_SEQUENCE" }, { "fieldName": "F8", "fieldType": * "WEIGHTED_STRING_SEQUENCE" } ], *

      *

      * "excludedVariableNames": [ "F6" ] } * @return Returns a reference to this object so that method calls can be chained together. */ Builder dataSchema(String dataSchema); /** *

      * Describes the schema location for an Amazon Redshift DataSource. *

      *
      * @param dataSchemaUri
      *        Describes the schema location for an Amazon Redshift DataSource.
      * @return Returns a reference to this object so that method calls can be chained together.
      */
        Builder dataSchemaUri(String dataSchemaUri);
    }

    /**
     * Mutable {@link Builder} implementation backing {@code RedshiftDataSpec}.
     *
     * Each field is exposed three ways: a fluent method (declared on {@link Builder}) that stores the value and
     * returns this builder, plus a bean-style getter/setter pair. For the two nested model fields the bean-style
     * accessors convert between the model object held here and its builder form.
     * NOTE(review): the bean-style accessors are presumably consumed by reflective tooling - confirm before
     * changing their names or signatures.
     */
    static final class BuilderImpl implements Builder {
        private RedshiftDatabase databaseInformation;

        private String selectSqlQuery;

        private RedshiftDatabaseCredentials databaseCredentials;

        private String s3StagingLocation;

        private String dataRearrangement;

        private String dataSchema;

        private String dataSchemaUri;

        /** Creates an empty builder; every field starts out {@code null}. */
        private BuilderImpl() {
        }

        /**
         * Creates a builder pre-populated from an existing instance.
         *
         * @param model
         *        the instance whose field values are copied into this builder
         */
        private BuilderImpl(RedshiftDataSpec model) {
            databaseInformation(model.databaseInformation);
            selectSqlQuery(model.selectSqlQuery);
            databaseCredentials(model.databaseCredentials);
            s3StagingLocation(model.s3StagingLocation);
            dataRearrangement(model.dataRearrangement);
            dataSchema(model.dataSchema);
            dataSchemaUri(model.dataSchemaUri);
        }

        /**
         * @return a builder derived from the stored database information, or {@code null} when none is set
         */
        public final RedshiftDatabase.Builder getDatabaseInformation() {
            return databaseInformation != null ? databaseInformation.toBuilder() : null;
        }

        @Override
        public final Builder databaseInformation(RedshiftDatabase databaseInformation) {
            this.databaseInformation = databaseInformation;
            return this;
        }

        /**
         * Stores the object built from the given builder, or {@code null} when the builder itself is {@code null}.
         *
         * @param databaseInformation
         *        builder to materialize; may be {@code null}
         */
        public final void setDatabaseInformation(RedshiftDatabase.BuilderImpl databaseInformation) {
            this.databaseInformation = databaseInformation != null ? databaseInformation.build() : null;
        }

        /** @return the stored select SQL query, possibly {@code null} */
        public final String getSelectSqlQuery() {
            return selectSqlQuery;
        }

        @Override
        public final Builder selectSqlQuery(String selectSqlQuery) {
            this.selectSqlQuery = selectSqlQuery;
            return this;
        }

        /**
         * @param selectSqlQuery
         *        value to store; may be {@code null}
         */
        public final void setSelectSqlQuery(String selectSqlQuery) {
            this.selectSqlQuery = selectSqlQuery;
        }

        /**
         * @return a builder derived from the stored database credentials, or {@code null} when none are set
         */
        public final RedshiftDatabaseCredentials.Builder getDatabaseCredentials() {
            return databaseCredentials != null ? databaseCredentials.toBuilder() : null;
        }

        @Override
        public final Builder databaseCredentials(RedshiftDatabaseCredentials databaseCredentials) {
            this.databaseCredentials = databaseCredentials;
            return this;
        }

        /**
         * Stores the object built from the given builder, or {@code null} when the builder itself is {@code null}.
         *
         * @param databaseCredentials
         *        builder to materialize; may be {@code null}
         */
        public final void setDatabaseCredentials(RedshiftDatabaseCredentials.BuilderImpl databaseCredentials) {
            this.databaseCredentials = databaseCredentials != null ? databaseCredentials.build() : null;
        }

        /** @return the stored S3 staging location, possibly {@code null} */
        public final String getS3StagingLocation() {
            return s3StagingLocation;
        }

        @Override
        public final Builder s3StagingLocation(String s3StagingLocation) {
            this.s3StagingLocation = s3StagingLocation;
            return this;
        }

        /**
         * @param s3StagingLocation
         *        value to store; may be {@code null}
         */
        public final void setS3StagingLocation(String s3StagingLocation) {
            this.s3StagingLocation = s3StagingLocation;
        }

        /** @return the stored data rearrangement string, possibly {@code null} */
        public final String getDataRearrangement() {
            return dataRearrangement;
        }

        @Override
        public final Builder dataRearrangement(String dataRearrangement) {
            this.dataRearrangement = dataRearrangement;
            return this;
        }

        /**
         * @param dataRearrangement
         *        value to store; may be {@code null}
         */
        public final void setDataRearrangement(String dataRearrangement) {
            this.dataRearrangement = dataRearrangement;
        }

        /** @return the stored data schema string, possibly {@code null} */
        public final String getDataSchema() {
            return dataSchema;
        }

        @Override
        public final Builder dataSchema(String dataSchema) {
            this.dataSchema = dataSchema;
            return this;
        }

        /**
         * @param dataSchema
         *        value to store; may be {@code null}
         */
        public final void setDataSchema(String dataSchema) {
            this.dataSchema = dataSchema;
        }

        /** @return the stored data schema URI, possibly {@code null} */
        public final String getDataSchemaUri() {
            return dataSchemaUri;
        }

        @Override
        public final Builder dataSchemaUri(String dataSchemaUri) {
            this.dataSchemaUri = dataSchemaUri;
            return this;
        }

        /**
         * @param dataSchemaUri
         *        value to store; may be {@code null}
         */
        public final void setDataSchemaUri(String dataSchemaUri) {
            this.dataSchemaUri = dataSchemaUri;
        }

        /**
         * @return a new {@code RedshiftDataSpec} constructed from this builder
         */
        @Override
        public RedshiftDataSpec build() {
            return new RedshiftDataSpec(this);
        }

        /**
         * @return the enclosing class's {@code SDK_FIELDS} list
         */
        @Override
        public List<SdkField<?>> sdkFields() {
            // The type arguments were missing here ("List>"), which is not valid Java.
            return SDK_FIELDS;
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy