/*
 * Copyright © 2019 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package io.cdap.plugin.gcp.datastore.source;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableMap;
import com.google.datastore.v1.Entity;
import com.google.datastore.v1.EntityResult;
import com.google.datastore.v1.Key;
import com.google.datastore.v1.KindExpression;
import com.google.datastore.v1.PartitionId;
import com.google.datastore.v1.Query;
import com.google.datastore.v1.RunQueryRequest;
import com.google.datastore.v1.Value;
import com.google.datastore.v1.client.Datastore;
import com.google.datastore.v1.client.DatastoreException;
import com.google.datastore.v1.client.DatastoreHelper;
import com.google.protobuf.Int32Value;
import io.cdap.cdap.api.annotation.Description;
import io.cdap.cdap.api.annotation.Name;
import io.cdap.cdap.api.annotation.Plugin;
import io.cdap.cdap.api.data.batch.Input;
import io.cdap.cdap.api.data.format.StructuredRecord;
import io.cdap.cdap.api.data.schema.Schema;
import io.cdap.cdap.api.dataset.lib.KeyValue;
import io.cdap.cdap.etl.api.Emitter;
import io.cdap.cdap.etl.api.FailureCollector;
import io.cdap.cdap.etl.api.PipelineConfigurer;
import io.cdap.cdap.etl.api.StageConfigurer;
import io.cdap.cdap.etl.api.batch.BatchRuntimeContext;
import io.cdap.cdap.etl.api.batch.BatchSource;
import io.cdap.cdap.etl.api.batch.BatchSourceContext;
import io.cdap.plugin.common.LineageRecorder;
import io.cdap.plugin.gcp.datastore.source.util.DatastoreSourceConstants;
import io.cdap.plugin.gcp.datastore.util.DatastoreUtil;
import org.apache.hadoop.io.NullWritable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;
import javax.annotation.Nullable;

/**
 * Batch Datastore source plugin that reads data from Google Cloud Datastore.
 */
@Plugin(type = BatchSource.PLUGIN_TYPE)
@Name(DatastoreSource.NAME)
@Description("Google Cloud Datastore is a NoSQL document database built for automatic scaling and high performance. "
  + "Source plugin provides ability to read data from it by Kind with various filters usage.")
public class DatastoreSource extends BatchSource<NullWritable, Entity, StructuredRecord> {

  private static final Logger LOG = LoggerFactory.getLogger(DatastoreSource.class);
  public static final String NAME = "Datastore";

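  // Datastore value types that map directly to a simple CDAP schema type.
  // Complex types (nested entities and arrays) are handled separately in createSchema().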
  private static final Map<Value.ValueTypeCase, Schema> SUPPORTED_SIMPLE_TYPES =
    new ImmutableMap.Builder<Value.ValueTypeCase, Schema>()
    .put(Value.ValueTypeCase.STRING_VALUE, Schema.of(Schema.Type.STRING))
    .put(Value.ValueTypeCase.INTEGER_VALUE, Schema.of(Schema.Type.LONG))
    .put(Value.ValueTypeCase.DOUBLE_VALUE, Schema.of(Schema.Type.DOUBLE))
    .put(Value.ValueTypeCase.BOOLEAN_VALUE, Schema.of(Schema.Type.BOOLEAN))
    .put(Value.ValueTypeCase.TIMESTAMP_VALUE, Schema.of(Schema.LogicalType.TIMESTAMP_MICROS))
    .put(Value.ValueTypeCase.BLOB_VALUE, Schema.of(Schema.Type.BYTES))
    .put(Value.ValueTypeCase.NULL_VALUE, Schema.of(Schema.Type.NULL))
    .build();

  private final DatastoreSourceConfig config;
  private EntityToRecordTransformer entityToRecordTransformer;

  public DatastoreSource(DatastoreSourceConfig config) {
    this.config = config;
  }

  @Override
  public void configurePipeline(PipelineConfigurer pipelineConfigurer) {
    LOG.debug("Validate config during `configurePipeline` stage: {}", config);
    StageConfigurer stageConfigurer = pipelineConfigurer.getStageConfigurer();
    FailureCollector collector = stageConfigurer.getFailureCollector();
    config.validate(collector);
    // Since all the properties have been validated, throw an exception if the collector contains any errors.
    // This avoids adding the same validation errors again in the getSchema method call.
    collector.getOrThrowException();

    Schema configuredSchema = config.getSchema(collector);
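    // If the plugin is not configured to connect at design time, use the user-provided schema as-is.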
    if (!config.shouldConnect()) {
      stageConfigurer.setOutputSchema(configuredSchema);
      return;
    }

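    // No schema was provided, so infer it by fetching a sample entity from Datastore.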
    if (configuredSchema == null) {
      configuredSchema = getSchema(collector);
      stageConfigurer.setOutputSchema(configuredSchema);
      return;
    }

    stageConfigurer.setOutputSchema(configuredSchema);
  }

  @Override
  public void prepareRun(BatchSourceContext batchSourceContext) {
    LOG.debug("Validate config during `prepareRun` stage: {}", config);
    FailureCollector collector = batchSourceContext.getFailureCollector();
    config.validate(collector);
    collector.getOrThrowException();

    String project = config.getProject();
    String serviceAccount = config.getServiceAccount();

    String namespace = config.getNamespace();
    String kind = config.getKind();
    String pbQuery = config.constructPbQuery(collector).toString();
    String splits = String.valueOf(config.getNumSplits());

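    // Hand the query configuration to the input format that performs the actual reads at runtime.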
    batchSourceContext.setInput(
      Input.of(config.getReferenceName(),
               new DatastoreInputFormatProvider(project, serviceAccount, config.isServiceAccountFilePath(), namespace,
                                                kind, pbQuery, splits)));

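    // Record field-level lineage for the fields read from Cloud Datastore.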
    Schema schema = batchSourceContext.getOutputSchema();
    LineageRecorder lineageRecorder = new LineageRecorder(batchSourceContext, config.getReferenceName());
    lineageRecorder.createExternalDataset(schema);
    lineageRecorder.recordRead("Read", "Read from Cloud Datastore.",
                               Objects.requireNonNull(schema.getFields()).stream()
                                 .map(Schema.Field::getName)
                                 .collect(Collectors.toList()));
  }

  @Override
  public void initialize(BatchRuntimeContext context) throws Exception {
    super.initialize(context);
    entityToRecordTransformer = new EntityToRecordTransformer(context.getOutputSchema(),
                                                              config.getKeyType(context.getFailureCollector()),
                                                              config.getKeyAlias());
  }

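  // Transforms each Datastore entity produced by the input format into a StructuredRecord
  // that matches the configured output schema.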
  @Override
  public void transform(KeyValue<NullWritable, Entity> input,
                        Emitter<StructuredRecord> emitter) {
    Entity entity = input.getValue();
    StructuredRecord record = entityToRecordTransformer.transformEntity(entity);
    emitter.emit(record);
  }

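  /**
   * Infers the output schema by running a limit-1 query against the configured kind
   * (optionally restricted by the ancestor key) and mapping the properties of the returned
   * sample entity to CDAP schema fields.
   *
   * @param collector failure collector
   * @return inferred CDAP schema
   */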
  private Schema getSchema(FailureCollector collector) {
    Query.Builder queryBuilder = Query.newBuilder()
      .addKind(KindExpression.newBuilder().setName(config.getKind()).build())
      .setLimit(Int32Value.of(1));

    Key ancestorKey = constructAncestorKey(config, collector);
    if (ancestorKey != null) {
      queryBuilder.setFilter(DatastoreHelper.makeAncestorFilter(ancestorKey).build());
    }
    Query query = queryBuilder.build();
    LOG.debug("Executing query for `Get Schema`: {}", query);

    Datastore datastore = DatastoreUtil.getDatastoreV1(config.getServiceAccount(), config.isServiceAccountFilePath(),
                                                       config.getProject());
    Iterator<EntityResult> results;
    RunQueryRequest request = RunQueryRequest.newBuilder()
      .setQuery(query)
      .setPartitionId(PartitionId.newBuilder()
                        .setNamespaceId(config.getNamespace())
                        .setProjectId(config.getProject()))
      .build();

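    // Run the probe query; report any Datastore error through the failure collector.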
    try {
      results = datastore.runQuery(request).getBatch().getEntityResultsList().iterator();
    } catch (DatastoreException e) {
      collector.addFailure("Unable to fetch data from Datastore: " + e.getMessage(), null)
        .withStacktrace(e.getStackTrace());
      throw collector.getOrThrowException();
    }

    if (results.hasNext()) {
      Entity entity = results.next().getEntity();
      return constructSchema(entity, config.isIncludeKey(collector), config.getKeyAlias());
    }

    collector.addFailure("Cloud Datastore query did not return any results. ",
                         "Ensure Namespace, Kind and Ancestor properties are correct.")
      .withConfigProperty(DatastoreSourceConstants.PROPERTY_NAMESPACE)
      .withConfigProperty(DatastoreSourceConstants.PROPERTY_KIND)
      .withConfigProperty(DatastoreSourceConstants.PROPERTY_ANCESTOR);

    throw collector.getOrThrowException();
  }

  /**
   * Constructs an ancestor key using the given Datastore configuration.
   *
   * @param config Datastore configuration
   * @param collector failure collector
   * @return Datastore key instance
   */
  @VisibleForTesting
  @Nullable
  Key constructAncestorKey(DatastoreSourceConfig config, FailureCollector collector) {
    List<Key.PathElement> ancestor = config.getAncestor(collector);

    if (ancestor.size() <= 1) {
      return null;
    }

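    // The last path element is handled separately because it may be identified either by a numeric id
    // or by a string name; all preceding elements are added to the key path unchanged.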
    Key.PathElement keyElement = ancestor.get(ancestor.size() - 1);
    Key.Builder keyBuilder = Key.newBuilder()
      .setPartitionId(PartitionId.newBuilder()
                        .setProjectId(config.getProject())
                        .setNamespaceId(config.getNamespace()));

    ancestor.subList(0, ancestor.size() - 1).forEach(keyBuilder::addPath);

    if (keyElement.getIdTypeCase() == Key.PathElement.IdTypeCase.ID) {
      keyBuilder.addPath(Key.PathElement.newBuilder()
                           .setId(keyElement.getId())
                           .setKind(keyElement.getKind()).build());

    } else {
      keyBuilder.addPath(Key.PathElement.newBuilder()
                           .setName(keyElement.getName())
                           .setKind(keyElement.getKind()).build());

    }
    return keyBuilder.build();
  }


  /**
   * Constructs a CDAP schema based on the given Datastore entity and source configuration.
   * The Datastore key is added to the list of schema fields if the include-key flag is set to true.
   *
   * @param entity Datastore entity
   * @param isIncludeKey flag indicating whether the key should be included in the schema
   * @param keyName key name
   * @return CDAP schema
   */
  @VisibleForTesting
  Schema constructSchema(Entity entity, boolean isIncludeKey, String keyName) {
    List<Schema.Field> fields = constructSchemaFields(entity);

    if (isIncludeKey) {
      fields.add(Schema.Field.of(keyName, Schema.of(Schema.Type.STRING)));
    }

    return Schema.recordOf("schema", fields);
  }

  /**
   * Constructs a list of CDAP schema fields based on the given Datastore entity,
   * filtering out fields whose schema is null (i.e. unsupported types).
   *
   * @param entity Datastore entity
   * @return list of CDAP schema fields
   */
  private List<Schema.Field> constructSchemaFields(Entity entity) {
    return entity.getPropertiesMap().entrySet().stream()
      .map(entry -> transformToField(entry.getKey(), entry.getValue()))
      .filter(Objects::nonNull)
      .collect(Collectors.toList());
  }

  /**
   * Since Datastore is a schemaless database, creates a field with a nullable schema for the given value
   * based on its value type; returns null for unsupported types.
   *
   * @param name field name
   * @param value Datastore value
   * @return CDAP field
   */
  private Schema.Field transformToField(String name, Value value) {
    Schema schema = createSchema(name, value);
    if (schema == null) {
      return null;
    }
    return Schema.Type.NULL == schema.getType()
      ? Schema.Field.of(name, schema)
      : Schema.Field.of(name, Schema.nullableOf(schema));
  }

  /**
   * Creates a CDAP schema based on the given Datastore value and its type;
   * returns null for unsupported types.
   *
   * @param name field name
   * @param value Datastore value
   * @return CDAP schema
   */
  private Schema createSchema(String name, Value value) {
    Schema schema = SUPPORTED_SIMPLE_TYPES.get(value.getValueTypeCase());

    if (schema != null) {
      return schema;
    }

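    // Complex types: nested entities become records, arrays become array schemas.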
    switch (value.getValueTypeCase()) {
      case ENTITY_VALUE:
        List<Schema.Field> fields = constructSchemaFields(value.getEntityValue());
        return Schema.recordOf(name, fields);
      case ARRAY_VALUE:
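        // Collect the distinct schemas of the array elements; if any element type is unsupported,
        // the whole field is skipped.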
        @SuppressWarnings("unchecked")
        List<Value> values = value.getArrayValue().getValuesList();
        Set<Schema> arraySchemas = new HashSet<>();
        for (Value val: values) {
          Schema valSchema = createSchema(name, val);
          if (valSchema == null) {
            return null;
          }
          arraySchemas.add(valSchema);
        }

        if (arraySchemas.isEmpty()) {
          return Schema.arrayOf(Schema.of(Schema.Type.NULL));
        }

        if (arraySchemas.size() == 1) {
          Schema componentSchema = arraySchemas.iterator().next();
          return Schema.Type.NULL == componentSchema.getType()
            ? Schema.arrayOf(componentSchema)
            : Schema.arrayOf(Schema.nullableOf(componentSchema));
        }

        LOG.debug("Field '{}' has several schemas in array, add them as union of schemas "
                    + "plus {} schema for null values", name, Schema.Type.NULL);
        arraySchemas.add(Schema.of(Schema.Type.NULL));
        return Schema.arrayOf(Schema.unionOf(arraySchemas));
    }

    LOG.debug("Field '{}' is of unsupported type '{}', skipping field from the schema", name, value.getValueTypeCase());
    return null;
  }

}