/*
* Copyright © 2019-2020 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package io.cdap.plugin.gcp.bigtable.sink;
import com.google.cloud.bigtable.hbase.BigtableConfiguration;
import com.google.cloud.bigtable.hbase.BigtableOptionsFactory;
import com.google.common.collect.ImmutableSet;
import io.cdap.cdap.api.annotation.Description;
import io.cdap.cdap.api.annotation.Name;
import io.cdap.cdap.api.annotation.Plugin;
import io.cdap.cdap.api.annotation.Requirements;
import io.cdap.cdap.api.data.batch.Output;
import io.cdap.cdap.api.data.format.StructuredRecord;
import io.cdap.cdap.api.data.schema.Schema;
import io.cdap.cdap.api.dataset.lib.KeyValue;
import io.cdap.cdap.etl.api.Emitter;
import io.cdap.cdap.etl.api.FailureCollector;
import io.cdap.cdap.etl.api.PipelineConfigurer;
import io.cdap.cdap.etl.api.StageConfigurer;
import io.cdap.cdap.etl.api.batch.BatchRuntimeContext;
import io.cdap.cdap.etl.api.batch.BatchSink;
import io.cdap.cdap.etl.api.batch.BatchSinkContext;
import io.cdap.plugin.common.ConfigUtil;
import io.cdap.plugin.common.LineageRecorder;
import io.cdap.plugin.gcp.bigtable.common.HBaseColumn;
import io.cdap.plugin.gcp.common.SourceOutputFormatProvider;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
import org.apache.hadoop.hbase.util.Bytes;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
/**
* A {@link BatchSink} that writes data to Cloud Bigtable.
* This plugin takes a {@link StructuredRecord} in, converts it to a {@link Put} mutation, and writes it to the
* Cloud Bigtable instance.
*/
@Plugin(type = BatchSink.PLUGIN_TYPE)
@Name(BigtableSink.NAME)
@Description("This sink writes data to Google Cloud Bigtable. " +
"Cloud Bigtable is Google's NoSQL Big Data database service.")
public final class BigtableSink extends BatchSink<StructuredRecord, ImmutableBytesWritable, Mutation> {
private static final Logger LOG = LoggerFactory.getLogger(BigtableSink.class);
public static final String NAME = "Bigtable";
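// Simple (non-logical) schema types that can be written as Bigtable cell values;
// the datetime logical type is additionally accepted in validateInputSchema.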
private static final Set<Schema.Type> SUPPORTED_FIELD_TYPES = ImmutableSet.of(
Schema.Type.BOOLEAN,
Schema.Type.INT,
Schema.Type.LONG,
Schema.Type.FLOAT,
Schema.Type.DOUBLE,
Schema.Type.BYTES,
Schema.Type.STRING
);
private final BigtableSinkConfig config;
private RecordToHBaseMutationTransformer transformer;
public BigtableSink(BigtableSinkConfig config) {
this.config = config;
}
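/**
 * Validates the configuration at deployment time. When the connection parameters are
 * fully configured (not macros), this also connects to Bigtable and, if the target
 * table already exists, checks it against the configured column mappings.
 */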
@Override
public void configurePipeline(PipelineConfigurer configurer) {
super.configurePipeline(configurer);
StageConfigurer stageConfigurer = configurer.getStageConfigurer();
FailureCollector collector = stageConfigurer.getFailureCollector();
config.validate(collector);
Schema inputSchema = stageConfigurer.getInputSchema();
if (inputSchema != null) {
validateInputSchema(inputSchema, collector);
}
if (config.connectionParamsConfigured()) {
Configuration conf = getConfiguration();
try (Connection connection = BigtableConfiguration.connect(conf);
Admin admin = connection.getAdmin()) {
TableName tableName = TableName.valueOf(config.table);
if (admin.tableExists(tableName)) {
validateExistingTable(connection, tableName, collector);
}
} catch (IOException e) {
// Don't fail deployments due to connect failures
LOG.warn("Failed to connect to Bigtable.", e);
}
}
}
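/**
 * Runs before the batch job starts: re-validates the configuration, creates the target
 * table if it does not exist, records lineage, and registers the output format.
 */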
@Override
public void prepareRun(BatchSinkContext context) {
FailureCollector collector = context.getFailureCollector();
config.validate(collector);
Configuration conf = getConfiguration();
try (Connection connection = BigtableConfiguration.connect(conf);
Admin admin = connection.getAdmin()) {
TableName tableName = TableName.valueOf(config.table);
if (admin.tableExists(tableName)) {
validateExistingTable(connection, tableName, collector);
} else {
createTable(connection, tableName, collector);
}
} catch (IOException e) {
collector.addFailure(
String.format("Failed to connect to Bigtable : %s", e.getMessage()), null)
.withConfigProperty(BigtableSinkConfig.BIGTABLE_OPTIONS)
.withStacktrace(e.getStackTrace());
}
collector.getOrThrowException();
// Both emitLineage and setOutputFormat internally try to create an external dataset if it does not already exist.
// We call emitLineage first since it creates the dataset with the schema.
emitLineage(context);
context.addOutput(Output.of(config.getReferenceName(),
new SourceOutputFormatProvider(BigtableOutputFormat.class, conf)));
}
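/**
 * Initializes the transformer that converts each {@link StructuredRecord} into an
 * HBase {@link Mutation}, using the configured key alias and column mappings.
 */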
@Override
public void initialize(BatchRuntimeContext context) throws Exception {
super.initialize(context);
FailureCollector collector = context.getFailureCollector();
Map<String, HBaseColumn> columnMappings = config.getColumnMappings(collector);
transformer = new RecordToHBaseMutationTransformer(config.keyAlias, columnMappings);
}
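/**
 * Converts the incoming record to a mutation and emits it. The row key is carried
 * inside the mutation itself, so the {@link KeyValue} key is left null.
 */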
@Override
public void transform(StructuredRecord record, Emitter<KeyValue<ImmutableBytesWritable, Mutation>> emitter) {
Mutation mutation = transformer.transform(record);
emitter.emit(new KeyValue<>(null, mutation));
}
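/**
 * Builds the Hadoop {@link Configuration} used to connect to Bigtable: service account
 * credentials (key file path or inline JSON), project and instance, the output table
 * name, and any additional Bigtable client options from the plugin config.
 */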
private Configuration getConfiguration() {
Configuration conf = new Configuration();
String serviceAccount = config.getServiceAccount();
if (serviceAccount != null) {
conf.setBoolean(BigtableOptionsFactory.BIGTABLE_USE_SERVICE_ACCOUNTS_KEY,
BigtableOptionsFactory.BIGTABLE_USE_SERVICE_ACCOUNTS_DEFAULT);
if (config.isServiceAccountFilePath()) {
conf.set(BigtableOptionsFactory.BIGTABLE_SERVICE_ACCOUNT_JSON_KEYFILE_LOCATION_KEY, serviceAccount);
} else {
conf.set(BigtableOptionsFactory.BIGTABLE_SERVICE_ACCOUNT_JSON_VALUE_KEY, serviceAccount);
}
}
BigtableConfiguration.configure(conf, config.getProject(), config.instance);
conf.set(TableOutputFormat.OUTPUT_TABLE, config.table);
config.getBigtableOptions().forEach(conf::set);
return conf;
}
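/**
 * Checks that the key alias field exists in the input schema, that every field has a
 * supported type (or the datetime logical type), and that every column referenced in
 * the column mappings is present in the input schema.
 */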
private void validateInputSchema(Schema inputSchema, FailureCollector collector) {
if (!config.containsMacro(BigtableSinkConfig.KEY_ALIAS) && inputSchema.getField(config.keyAlias) == null) {
collector.addFailure(
String.format("Field '%s' declared as key alias does not exist in input schema.", config.keyAlias),
"Specify input field name as key alias.").withConfigProperty(BigtableSinkConfig.KEY_ALIAS);
}
List<Schema.Field> fields = inputSchema.getFields();
if (fields == null || fields.isEmpty()) {
collector.addFailure("Input schema must contain fields.", null);
throw collector.getOrThrowException();
}
Set<String> fieldNames = new HashSet<>();
for (Schema.Field field : fields) {
fieldNames.add(field.getName());
Schema nonNullableSchema = field.getSchema().isNullable() ?
field.getSchema().getNonNullable() : field.getSchema();
if (!SUPPORTED_FIELD_TYPES.contains(nonNullableSchema.getType()) ||
(nonNullableSchema.getLogicalType() != Schema.LogicalType.DATETIME &&
nonNullableSchema.getLogicalType() != null)) {
String supportedTypes = SUPPORTED_FIELD_TYPES.stream()
.map(Enum::name)
.map(String::toLowerCase)
.collect(Collectors.joining(", "));
String errorMessage = String.format("Field '%s' is of unsupported type '%s'.",
field.getName(), nonNullableSchema.getDisplayName());
collector.addFailure(errorMessage, String.format("Supported types are: datetime, %s.", supportedTypes))
.withInputSchemaField(field.getName());
}
}
config.getColumnMappings(collector).keySet().forEach(column -> {
if (!fieldNames.contains(column)) {
collector.addFailure(
String.format("Column '%s' in column mappings does not exist in the input schema.", column),
String.format("Remove or modify column '%s' from column mappings.", column));
}
});
}
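/**
 * Creates the target table with one column family per distinct family referenced in
 * the column mappings.
 */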
private void createTable(Connection connection, TableName tableName, FailureCollector collector) {
try (Admin admin = connection.getAdmin()) {
HTableDescriptor tableDescriptor = new HTableDescriptor(tableName);
config.getColumnMappings(collector)
.values()
.stream()
.map(HBaseColumn::getFamily)
.distinct()
.map(HColumnDescriptor::new)
.forEach(tableDescriptor::addFamily);
admin.createTable(tableDescriptor);
} catch (IOException e) {
collector.addFailure(
String.format("Failed to create table '%s' in Bigtable : %s", tableName, e.getMessage()), null)
.withConfigProperty(BigtableSinkConfig.TABLE)
.withStacktrace(e.getStackTrace());
}
}
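/**
 * Verifies that every column family referenced in the column mappings already exists
 * in the target table, reporting a failure for each missing family.
 */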
private void validateExistingTable(Connection connection, TableName tableName, FailureCollector collector)
throws IOException {
try (Table table = connection.getTable(tableName)) {
Set<String> existingFamilies = table.getTableDescriptor()
.getFamiliesKeys()
.stream()
.map(Bytes::toString)
.collect(Collectors.toSet());
for (Map.Entry<String, HBaseColumn> entry : config.getColumnMappings(collector).entrySet()) {
String family = entry.getValue().getFamily();
if (!existingFamilies.contains(family)) {
collector.addFailure(
String.format("Column family '%s' does not exist in target table '%s'.", family, config.table),
String.format("Remove column family %s.", family))
.withConfigElement(BigtableSinkConfig.COLUMN_MAPPINGS,
ConfigUtil.getKVPair(entry.getKey(), entry.getValue().getQualifiedName(), "="));
}
}
}
}
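/**
 * Records field-level lineage for the write. Creating the external dataset here also
 * registers the schema, which is why this runs before the output is added in prepareRun.
 */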
private void emitLineage(BatchSinkContext context) {
Schema inputSchema = context.getInputSchema();
LineageRecorder lineageRecorder = new LineageRecorder(context, config.getReferenceName());
lineageRecorder.createExternalDataset(inputSchema);
if (inputSchema != null) {
List<Schema.Field> fields = inputSchema.getFields();
if (fields != null) {
List<String> fieldNames = fields.stream()
.map(Schema.Field::getName)
.collect(Collectors.toList());
String operationDescription = String.format("Wrote to Bigtable. Project: '%s', Instance: '%s', Table: '%s'",
config.getProject(), config.instance, config.table);
lineageRecorder.recordWrite("Write", operationDescription, fieldNames);
}
}
}
}