/*
* Copyright 2017 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.cloud.bigtable.beam.sequencefiles;

import com.google.bigtable.repackaged.com.google.api.core.InternalExtensionOnly;
import com.google.cloud.bigtable.beam.CloudBigtableIO;
import com.google.cloud.bigtable.beam.CloudBigtableTableConfiguration;
import com.google.cloud.bigtable.beam.TemplateUtils;
import com.google.cloud.bigtable.thirdparty.org.apache.hadoop.hbase.mapreduce.ResultSerialization;
import com.google.common.annotations.VisibleForTesting;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.PipelineResult;
import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
import org.apache.beam.sdk.io.Read;
import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.options.ValueProvider;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PDone;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.io.serializer.WritableSerialization;

/**
 * A job that imports data from a Cloud Storage bucket in HBase SequenceFile format into Cloud
 * Bigtable. This job can be run directly or as a Dataflow template.
 *
 * <p>Execute the following command to run the job directly:
 *
 * <pre>
 * mvn compile exec:java \
 *   -DmainClass=com.google.cloud.bigtable.beam.sequencefiles.ImportJob \
 *   -Dexec.args="--runner=DataflowRunner \
 *                --stagingLocation=gs://$STAGING_PATH \
 *                --project=$PROJECT \
 *                --bigtableInstanceId=$INSTANCE \
 *                --bigtableTableId=$TABLE \
 *                --sourcePattern=gs://$SOURCE_PATTERN"
 * </pre>
 *
 * <p>Execute the following command to create the Dataflow template:
 *
 * <pre>
 * mvn compile exec:java \
 *   -DmainClass=com.google.cloud.bigtable.beam.sequencefiles.ImportJob \
 *   -Dexec.args="--runner=DataflowRunner \
 *                --project=$PROJECT \
 *                --stagingLocation=gs://$STAGING_PATH \
 *                --templateLocation=gs://$TEMPLATE_PATH \
 *                --wait=false"
 * </pre>
 *
 * <p>There are a few ways to run the pipeline using the template. See the Dataflow documentation
 * for details: https://cloud.google.com/dataflow/docs/templates/executing-templates. Optionally,
 * you can upload a metadata file that contains information about the runtime parameters, which can
 * be used for parameter validation and more. A sample metadata file can be found at
 * "src/main/resources/ImportJob_metadata".
 *
 * <p>An example using the gcloud command line:
 *
 * <pre>
 * gcloud beta dataflow jobs run $JOB_NAME \
 *   --gcs-location gs://$TEMPLATE_PATH \
 *   --parameters bigtableProject=$PROJECT,bigtableInstanceId=$INSTANCE,bigtableTableId=$TABLE,sourcePattern=gs://$SOURCE_PATTERN
 * </pre>
 */
@InternalExtensionOnly
public class ImportJob {
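  // Desired bundle size (100 MiB) passed to the SequenceFileSource when reading the input files.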
  static final long BUNDLE_SIZE = 100 * 1024 * 1024;
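
  /**
   * Pipeline options for the import job. The Bigtable and source parameters are {@link
   * ValueProvider}s so they can be supplied at runtime when the job is launched from a Dataflow
   * template.
   */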
  public interface ImportOptions extends GcpOptions {
    @Description("The Bigtable App Profile id.")
    ValueProvider<String> getBigtableAppProfileId();

    @SuppressWarnings("unused")
    void setBigtableAppProfileId(ValueProvider<String> appProfileId);

    @Description("The project that contains the table to import into. Defaults to --project.")
    @Default.InstanceFactory(Utils.DefaultBigtableProjectFactory.class)
    ValueProvider<String> getBigtableProject();

    @SuppressWarnings("unused")
    void setBigtableProject(ValueProvider<String> projectId);

    @Description("The Bigtable instance id that contains the table to import into.")
    ValueProvider<String> getBigtableInstanceId();

    @SuppressWarnings("unused")
    void setBigtableInstanceId(ValueProvider<String> instanceId);

    @Description("The Bigtable table id to import into.")
    ValueProvider<String> getBigtableTableId();

    @SuppressWarnings("unused")
    void setBigtableTableId(ValueProvider<String> tableId);

    @Description(
        "The fully qualified file pattern to import. Should be of the form '[destinationPath]/part-*'")
    ValueProvider<String> getSourcePattern();

    @SuppressWarnings("unused")
    void setSourcePattern(ValueProvider<String> sourcePath);

    @Description(
        "Optional: set mutation latency throttling (enables the feature). Value in milliseconds.")
    @Default.Integer(0)
    ValueProvider<Integer> getMutationThrottleLatencyMs();

    @SuppressWarnings("unused")
    void setMutationThrottleLatencyMs(ValueProvider<Integer> throttleMs);

    // When creating a template, this flag must be set to false.
    @Description("Wait for pipeline to finish.")
    @Default.Boolean(true)
    boolean getWait();

    @SuppressWarnings("unused")
    void setWait(boolean wait);
  }
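
  /**
   * Parses the pipeline options, builds the import pipeline, runs it, and optionally waits for
   * the result.
   */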
  public static void main(String[] args) {
    PipelineOptionsFactory.register(ImportOptions.class);

    ImportOptions opts =
        PipelineOptionsFactory.fromArgs(args).withValidation().as(ImportOptions.class);

    Pipeline pipeline = buildPipeline(opts);

    PipelineResult result = pipeline.run();

    if (opts.getWait()) {
      Utils.waitForPipelineToFinish(result);
    }
  }
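
  /**
   * Builds the import pipeline: reads the SequenceFiles matching the source pattern, converts each
   * HBase {@link Result} into Bigtable mutations, and writes the mutations to the target table.
   */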
  @VisibleForTesting
  static Pipeline buildPipeline(ImportOptions opts) {
    Pipeline pipeline = Pipeline.create(Utils.tweakOptions(opts));
    pipeline
        .apply(
            "Read Sequence File",
            Read.from(new ShuffledSource<>(createSource(opts.getSourcePattern()))))
        .apply("Create Mutations", ParDo.of(new HBaseResultToMutationFn()))
        .apply("Write to Bigtable", createSink(opts));

    return pipeline;
  }
  static SequenceFileSource<ImmutableBytesWritable, Result> createSource(
      ValueProvider<String> sourcePattern) {
    return new SequenceFileSource<>(
        sourcePattern,
        ImmutableBytesWritable.class,
        WritableSerialization.class,
        Result.class,
        ResultSerialization.class,
        BUNDLE_SIZE);
  }
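
  /**
   * Creates the sink that writes the generated {@link Mutation}s to the configured Bigtable table.
   */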
  static PTransform<PCollection<Mutation>, PDone> createSink(ImportOptions opts) {
    CloudBigtableTableConfiguration config =
        TemplateUtils.buildImportConfig(opts, "SequenceFileImportJob");
    return CloudBigtableIO.writeToTable(config);
  }
}