/*
* Copyright © 2018 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package io.cdap.plugin.gcp.spanner.sink;
import com.google.cloud.spanner.DatabaseClient;
import com.google.cloud.spanner.DatabaseId;
import com.google.cloud.spanner.Mutation;
import com.google.cloud.spanner.Spanner;
import io.cdap.cdap.api.data.schema.Schema;
import io.cdap.plugin.gcp.spanner.SpannerConstants;
import io.cdap.plugin.gcp.spanner.common.BytesCounter;
import io.cdap.plugin.gcp.spanner.common.SpannerUtil;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormatCounter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import javax.annotation.Nullable;
/**
* An {@link OutputFormat} that writes {@link Mutation}s to a Google Cloud Spanner table.
*/
public class SpannerOutputFormat extends OutputFormat<NullWritable, Mutation> {
/**
* Copies properties from the given SpannerSinkConfig into the Hadoop Configuration.
*
* @param configuration the Hadoop configuration to set the properties in
* @param config the spanner configuration
* @param schema schema for spanner table
*/
public static void configure(Configuration configuration, SpannerSinkConfig config, @Nullable Schema schema) {
String projectId = config.connection.getProject();
configuration.set(SpannerConstants.PROJECT_ID, projectId);
String serviceAccount = config.connection.getServiceAccount();
if (serviceAccount != null) {
String type = config.connection.isServiceAccountFilePath() ? SpannerConstants.SERVICE_ACCOUNT_TYPE_FILE_PATH :
SpannerConstants.SERVICE_ACCOUNT_TYPE_JSON;
configuration.set(SpannerConstants.SERVICE_ACCOUNT_TYPE, type);
configuration.set(SpannerConstants.SERVICE_ACCOUNT, serviceAccount);
}
configuration.set(SpannerConstants.INSTANCE_ID, config.getInstance());
configuration.set(SpannerConstants.DATABASE, config.getDatabase());
configuration.set(SpannerConstants.TABLE_NAME, config.getTable());
String keys = config.getKeys();
if (keys != null) {
configuration.set(SpannerConstants.KEYS, keys);
}
configuration.set(SpannerConstants.SPANNER_WRITE_BATCH_SIZE, String.valueOf(config.getBatchSize()));
if (schema != null) {
configuration.set(SpannerConstants.SCHEMA, schema.toString());
}
}
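// A minimal caller sketch (hypothetical; "sinkConfig" and "tableSchema" are
// assumptions standing in for a real SpannerSinkConfig and CDAP Schema instance):
//
//   Job job = Job.getInstance(new Configuration(), "spanner-sink");
//   SpannerOutputFormat.configure(job.getConfiguration(), sinkConfig, tableSchema);
//   job.setOutputFormatClass(SpannerOutputFormat.class);
//   job.setOutputKeyClass(NullWritable.class);
//   job.setOutputValueClass(Mutation.class);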
@Override
public RecordWriter<NullWritable, Mutation> getRecordWriter(TaskAttemptContext context)
throws IOException {
Configuration configuration = context.getConfiguration();
SpannerUtil.verifyPresenceOrCreateDatabaseAndTable(configuration);
String projectId = configuration.get(SpannerConstants.PROJECT_ID);
String instanceId = configuration.get(SpannerConstants.INSTANCE_ID);
String database = configuration.get(SpannerConstants.DATABASE);
String serviceAccountType = configuration.get(SpannerConstants.SERVICE_ACCOUNT_TYPE);
String serviceAccount = configuration.get(SpannerConstants.SERVICE_ACCOUNT);
BytesCounter counter = new BytesCounter();
Spanner spanner = SpannerUtil.getSpannerServiceWithWriteInterceptor(
serviceAccount,
SpannerConstants.SERVICE_ACCOUNT_TYPE_FILE_PATH.equals(serviceAccountType),
projectId,
counter);
int batchSize = Integer.parseInt(configuration.get(SpannerConstants.SPANNER_WRITE_BATCH_SIZE));
DatabaseId db = DatabaseId.of(projectId, instanceId, database);
DatabaseClient client = spanner.getDatabaseClient(db);
return new SpannerRecordWriter(spanner, client, batchSize, counter);
}
/**
* A RecordWriter that buffers mutations and writes them to Cloud Spanner in batches.
*/
protected static class SpannerRecordWriter extends RecordWriter<NullWritable, Mutation> {
private final Spanner spanner;
private final DatabaseClient databaseClient;
private final List<Mutation> mutations;
private final int batchSize;
private final BytesCounter counter;
public SpannerRecordWriter(Spanner spanner, DatabaseClient client, int batchSize, BytesCounter counter) {
this.spanner = spanner;
this.databaseClient = client;
this.mutations = new ArrayList<>();
this.batchSize = batchSize;
this.counter = counter;
}
@Override
public void write(NullWritable nullWritable, Mutation mutation) {
mutations.add(mutation);
// Flush once the buffer reaches the configured batch size.
if (mutations.size() >= batchSize) {
databaseClient.write(mutations);
mutations.clear();
}
}
@Override
public void close(TaskAttemptContext taskAttemptContext) {
try {
// Flush any mutations still buffered for this task.
if (!mutations.isEmpty()) {
databaseClient.write(mutations);
mutations.clear();
}
// Record bytes written across all flushes, not just the final one.
taskAttemptContext.getCounter(FileOutputFormatCounter.BYTES_WRITTEN).increment(counter.getValue());
} finally {
spanner.close();
}
}
}
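// A sketch of the Mutation values this writer consumes, built with the public
// com.google.cloud.spanner.Mutation builder (the "users" table and its columns
// are hypothetical, for illustration only):
//
//   Mutation mutation = Mutation.newInsertOrUpdateBuilder("users")
//       .set("id").to(42L)
//       .set("name").to("Ada")
//       .build();
//   writer.write(NullWritable.get(), mutation);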
@Override
public void checkOutputSpecs(JobContext jobContext) {
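// No-op: the database and table are verified (and created if absent) lazily in
// getRecordWriter via SpannerUtil.verifyPresenceOrCreateDatabaseAndTable.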
}
/**
* Returns a no-op OutputCommitter, since Spanner writes are committed directly by the record writer.
*/
@Override
public OutputCommitter getOutputCommitter(TaskAttemptContext taskAttemptContext) {
return new OutputCommitter() {
@Override
public void setupJob(JobContext jobContext) {
}
@Override
public void setupTask(TaskAttemptContext taskAttemptContext) {
}
@Override
public boolean needsTaskCommit(TaskAttemptContext taskAttemptContext) {
return false;
}
@Override
public void commitTask(TaskAttemptContext taskAttemptContext) {
}
@Override
public void abortTask(TaskAttemptContext taskAttemptContext) {
}
};
}
}