/*
* Copyright © 2019 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/

package io.cdap.plugin.gcp.datastore.sink;

import com.google.datastore.v1.Entity;
import io.cdap.cdap.api.annotation.Description;
import io.cdap.cdap.api.annotation.Name;
import io.cdap.cdap.api.annotation.Plugin;
import io.cdap.cdap.api.annotation.Requirements;
import io.cdap.cdap.api.data.batch.Output;
import io.cdap.cdap.api.data.format.StructuredRecord;
import io.cdap.cdap.api.data.schema.Schema;
import io.cdap.cdap.api.dataset.lib.KeyValue;
import io.cdap.cdap.etl.api.Emitter;
import io.cdap.cdap.etl.api.FailureCollector;
import io.cdap.cdap.etl.api.PipelineConfigurer;
import io.cdap.cdap.etl.api.StageConfigurer;
import io.cdap.cdap.etl.api.batch.BatchRuntimeContext;
import io.cdap.cdap.etl.api.batch.BatchSink;
import io.cdap.cdap.etl.api.batch.BatchSinkContext;
import io.cdap.plugin.common.LineageRecorder;
import org.apache.hadoop.io.NullWritable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.stream.Collectors;

/**
* A {@link BatchSink} that writes data to Cloud Datastore.
 * This {@link DatastoreSink} takes a {@link StructuredRecord} in, converts it to an {@link Entity},
 * and writes it to the configured Cloud Datastore kind.
*/
@Plugin(type = BatchSink.PLUGIN_TYPE)
@Name(DatastoreSink.PLUGIN_NAME)
@Description("CDAP Google Cloud Datastore Batch Sink takes the structured record from the input source and writes "
+ "to Google Cloud Datastore.")
@Requirements(capabilities = "bypass_cmek_check")
public class DatastoreSink extends BatchSink<StructuredRecord, NullWritable, Entity> {
private static final Logger LOG = LoggerFactory.getLogger(DatastoreSink.class);
public static final String PLUGIN_NAME = "Datastore";
private final DatastoreSinkConfig config;
  private RecordToEntityTransformer recordToEntityTransformer;

public DatastoreSink(DatastoreSinkConfig config) {
this.config = config;
  }

@Override
public void configurePipeline(PipelineConfigurer pipelineConfigurer) {
super.configurePipeline(pipelineConfigurer);
StageConfigurer configurer = pipelineConfigurer.getStageConfigurer();
Schema inputSchema = configurer.getInputSchema();
FailureCollector collector = configurer.getFailureCollector();
config.validate(inputSchema, collector);
}
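
  // Illustrative only (an assumption, not this plugin's actual checks): DatastoreSinkConfig.validate
  // reports problems through the FailureCollector rather than throwing, along the lines of:
  //
  //   if (Strings.isNullOrEmpty(kind)) {
  //     collector.addFailure("Kind must be specified.", "Provide a non-empty kind name.");
  //   }
  //
  // FailureCollector.addFailure(String message, String correctiveAction) is the standard CDAP
  // validation API; failures collected at deploy time surface before the pipeline ever runs.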
@SuppressWarnings("ConstantConditions")
@Override
public void prepareRun(BatchSinkContext context) {
Schema inputSchema = context.getInputSchema();
LOG.debug("DatastoreSink `prepareRun` input schema: {}", inputSchema);
FailureCollector collector = context.getFailureCollector();
config.validate(inputSchema, collector);
collector.getOrThrowException();
String project = config.getProject();
String serviceAccount = config.getServiceAccount();
String shouldAutoGenerateKey = Boolean.toString(config.shouldUseAutoGeneratedKey(collector));
String batchSize = Integer.toString(config.getBatchSize());
String shouldUseTransactions = Boolean.toString(config.shouldUseTransactions());
context.addOutput(Output.of(config.getReferenceName(),
new DatastoreOutputFormatProvider(project, serviceAccount,
config.isServiceAccountFilePath(),
shouldAutoGenerateKey, batchSize,
shouldUseTransactions)));
LineageRecorder lineageRecorder = new LineageRecorder(context, config.getReferenceName());
lineageRecorder.createExternalDataset(inputSchema);
// Record the field level WriteOperation
lineageRecorder.recordWrite("Write", "Wrote to Cloud Datastore sink",
inputSchema.getFields().stream()
.map(Schema.Field::getName)
.collect(Collectors.toList()));
}
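
  // For context: DatastoreOutputFormatProvider (not shown here) implements CDAP's
  // OutputFormatProvider contract, i.e. getOutputFormatClassName() and
  // getOutputFormatConfiguration(). A minimal sketch, with assumed class and key names:
  //
  //   public String getOutputFormatClassName() {
  //     return DatastoreOutputFormat.class.getName();   // assumed output format class
  //   }
  //
  //   public Map<String, String> getOutputFormatConfiguration() {
  //     return config;  // e.g. project, service account, batch size, transaction flag
  //   }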
@Override
public void initialize(BatchRuntimeContext context) throws Exception {
super.initialize(context);
FailureCollector collector = context.getFailureCollector();
this.recordToEntityTransformer = new RecordToEntityTransformer(config.getProject(),
config.getNamespace(),
config.getKind(),
config.getKeyType(collector),
config.getKeyAlias(),
config.getAncestor(collector),
config.getIndexStrategy(collector),
config.getIndexedProperties());
}
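
  // Illustrative mapping (the real conversion logic lives in RecordToEntityTransformer):
  // a record such as { "name": "Alice", "age": 30 } with an auto-generated key becomes an
  // Entity in the configured namespace/kind whose properties are the record's fields, with
  // indexing applied per the configured index strategy and indexed properties.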
@Override
  public void transform(StructuredRecord record, Emitter<KeyValue<NullWritable, Entity>> emitter) {
    Entity entity = recordToEntityTransformer.transformStructuredRecord(record);
    // NullWritable carries no data, so the key is null; only the Entity value is written.
    emitter.emit(new KeyValue<>(null, entity));
}
}
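
/*
 * Illustrative stage configuration for this sink in a CDAP pipeline JSON. The property names
 * below are inferred from the DatastoreSinkConfig getters used above and are assumptions,
 * not the plugin's authoritative widget property names:
 *
 * {
 *   "name": "DatastoreSink",
 *   "plugin": {
 *     "name": "Datastore",
 *     "type": "batchsink",
 *     "properties": {
 *       "referenceName": "datastore_sink",
 *       "project": "my-gcp-project",
 *       "namespace": "default",
 *       "kind": "Task",
 *       "keyType": "Auto-generated key",
 *       "indexStrategy": "All",
 *       "batchSize": "25"
 *     }
 *   }
 * }
 */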