/*
 * Copyright 2016 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package com.spotify.scio.bigtable;

import com.google.api.client.util.Lists;
import com.google.bigtable.repackaged.com.google.cloud.config.BulkOptions;
import com.google.cloud.bigtable.dataflow.AbstractCloudBigtableTableDoFn;
import com.google.cloud.bigtable.dataflow.CloudBigtableConfiguration;
import com.google.cloud.bigtable.dataflow.CloudBigtableIO;
import com.google.cloud.dataflow.sdk.transforms.Aggregator;
import com.google.cloud.dataflow.sdk.transforms.DoFn;
import com.google.cloud.dataflow.sdk.transforms.PTransform;
import com.google.cloud.dataflow.sdk.transforms.Sum;
import com.google.cloud.dataflow.sdk.values.KV;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.cloud.dataflow.sdk.values.PDone;
import com.google.common.collect.Maps;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.BufferedMutator;
import org.apache.hadoop.hbase.client.BufferedMutatorParams;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.RetriesExhaustedWithDetailsException;

import java.io.IOException;
import java.util.List;
import java.util.Map;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Strings.isNullOrEmpty;
/**
 * Reimplemented utilities to create {@link com.google.cloud.dataflow.sdk.transforms.PTransform}s
 * for writing to multiple Bigtable tables from within a Dataflow pipeline.
 *
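 * <p>A minimal usage sketch (non-authoritative; the project and instance IDs are placeholders,
 * and {@code tableMutations} stands in for any
 * {@code PCollection<KV<String, Iterable<Mutation>>>} built upstream):
 *
 * <pre>{@code
 * CloudBigtableConfiguration config = new CloudBigtableConfiguration.Builder()
 *     .withProjectId("my-project")    // placeholder
 *     .withInstanceId("my-instance")  // placeholder
 *     .build();
 *
 * PCollection<KV<String, Iterable<Mutation>>> tableMutations = ...;
 * tableMutations.apply(BigtableMultiTableWrite.writeToMultipleTables(config));
 * }</pre>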
 */
public class BigtableMultiTableWrite {
  /**
   * A {@link DoFn} that can write either a bounded or unbounded {@link PCollection} of {@link KV}
   * of (String tableName, Iterable of {@link Mutation}s) to the named tables, using one
   * {@link BufferedMutator} per table.
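   *
   * <p>For example, one input element might be built like this (the table name and the
   * {@code put}/{@code delete} mutations are hypothetical placeholders; {@code ImmutableList}
   * is Guava's):
   *
   * <pre>{@code
   * Iterable<Mutation> mutations = ImmutableList.<Mutation>of(put, delete);
   * KV<String, Iterable<Mutation>> element = KV.of("events-table", mutations);
   * }</pre>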
   */
  public static class CloudBigtableMultiTableBufferedWriteFn
      extends AbstractCloudBigtableTableDoFn<KV<String, Iterable<Mutation>>, Void> {
    private static final long serialVersionUID = 2L;
    private Map<String, BufferedMutator> mutators;

    // Stats
    private final Aggregator<Long, Long> mutationsCounter;
    private final Aggregator<Long, Long> exceptionsCounter;

    public CloudBigtableMultiTableBufferedWriteFn(CloudBigtableConfiguration config) {
      super(config);
      mutationsCounter = createAggregator("mutations", new Sum.SumLongFn());
      exceptionsCounter = createAggregator("exceptions", new Sum.SumLongFn());
    }

    @Override
    public void startBundle(Context context) throws Exception {
      mutators = Maps.newConcurrentMap();
    }

    private synchronized BufferedMutator getBufferedMutator(final Context context,
        final String tableName) throws IOException {
      // Lazily create a single BufferedMutator per destination table for this bundle.
      BufferedMutator mutator = mutators.get(tableName);
      if (mutator == null) {
        BufferedMutator.ExceptionListener listener = createExceptionListener(context);
        BufferedMutatorParams params = new BufferedMutatorParams(TableName.valueOf(tableName))
            .writeBufferSize(BulkOptions.BIGTABLE_MAX_MEMORY_DEFAULT)
            .listener(listener);
        mutator = getConnection().getBufferedMutator(params);
        mutators.put(tableName, mutator);
      }
      return mutator;
    }

    protected BufferedMutator.ExceptionListener createExceptionListener(final Context context) {
      return new BufferedMutator.ExceptionListener() {
        @Override
        public void onException(RetriesExhaustedWithDetailsException exception,
            BufferedMutator mutator) throws RetriesExhaustedWithDetailsException {
          logExceptions(context, exception);
          throw exception;
        }
      };
    }

    /**
     * Performs asynchronous mutations via {@link BufferedMutator#mutate(List)}.
     */
    @Override
    public void processElement(ProcessContext context) throws Exception {
      KV<String, Iterable<Mutation>> element = context.element();
      final List<Mutation> mutations = Lists.newArrayList(element.getValue());
      getBufferedMutator(context, element.getKey()).mutate(mutations);
      mutationsCounter.addValue((long) mutations.size());
    }

    /**
     * Closes each {@link BufferedMutator} and the {@link Connection}.
     */
    @Override
    public void finishBundle(Context context) throws Exception {
      try {
        for (BufferedMutator mutator : mutators.values()) {
          mutator.close();
        }
      } catch (RetriesExhaustedWithDetailsException exception) {
        exceptionsCounter.addValue((long) exception.getCauses().size());
        logExceptions(context, exception);
        rethrowException(exception);
      } finally {
        // Close the connection to clean up resources.
        super.finishBundle(context);
      }
    }
  }

  /**
   * Creates a {@link PTransform} that can write either a bounded or unbounded {@link PCollection}
   * of {@link KV} of (String tableName, Iterable of {@link Mutation}s) to the named tables.
   *
   * <p>NOTE: This {@link PTransform} will write {@link Put}s and {@link Delete}s, not {@link
   * org.apache.hadoop.hbase.client.Append}s and {@link org.apache.hadoop.hbase.client.Increment}s.
   * This limitation exists because if a batch fails partway through, the Appends/Increments might
   * be re-run, causing the {@link Mutation} to be executed twice, which is never the user's
   * intent. Re-running a Delete will not cause any differences. Re-running a Put isn't normally a
   * problem, but may be when the column family keeps more than one version per cell. In a case
   * where multiple versions could be a problem, it's best to add an explicit timestamp to the
   * {@link Put}.
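   *
   * <p>A sketch of that timestamp workaround (the row key, family, qualifier, and value are
   * hypothetical placeholders):
   *
   * <pre>{@code
   * Put put = new Put(rowKey);
   * // Pin an explicit timestamp so a re-run writes the same cell version.
   * put.addColumn(family, qualifier, System.currentTimeMillis(), value);
   * }</pre>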
   */
  public static PTransform<PCollection<KV<String, Iterable<Mutation>>>, PDone>
      writeToMultipleTables(CloudBigtableConfiguration config) {
    validateConfig(config);
    return new CloudBigtableIO.CloudBigtableWriteTransform<>(
        new CloudBigtableMultiTableBufferedWriteFn(config));
  }

  private static void checkNotNullOrEmpty(String value, String type) {
    checkArgument(
        !isNullOrEmpty(value), "A " + type + " must be set to configure Bigtable properly.");
  }

  private static void validateConfig(CloudBigtableConfiguration configuration) {
    checkNotNullOrEmpty(configuration.getProjectId(), "projectId");
    checkNotNullOrEmpty(configuration.getInstanceId(), "instanceId");
  }
}