/*
* Copyright 2017 Google Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.cloud.bigtable.beam.sequencefiles;

import com.google.cloud.bigtable.beam.CloudBigtableIO;
import com.google.cloud.bigtable.beam.CloudBigtableScanConfiguration;
import com.google.cloud.bigtable.beam.TemplateUtils;
import java.io.Serializable;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.PipelineResult;
import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
import org.apache.beam.sdk.io.DefaultFilenamePolicy;
import org.apache.beam.sdk.io.FileSystems;
import org.apache.beam.sdk.io.Read;
import org.apache.beam.sdk.io.WriteFiles;
import org.apache.beam.sdk.io.fs.ResolveOptions.StandardResolveOptions;
import org.apache.beam.sdk.io.fs.ResourceId;
import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.options.ValueProvider;
import org.apache.beam.sdk.options.ValueProvider.NestedValueProvider;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.transforms.SerializableFunction;
import org.apache.beam.sdk.transforms.SimpleFunction;
import org.apache.beam.sdk.values.KV;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.io.serializer.WritableSerialization;

/**
*
* Beam job to export a Bigtable table to a set of SequenceFiles. The files can then be
* imported into another Bigtable or HBase table. You can limit the rows and columns
* exported using the options in {@link ExportOptions}. Note that the rows in the
* SequenceFiles will not be sorted.
*
* Furthermore, you can export a subset of the data using a combination of --bigtableStartRow,
* --bigtableStopRow and --bigtableFilter, as shown in the example below.
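*
* For example, the direct-run command shown below could be extended with flags like the
* following (a sketch only; the bracketed values are placeholders for your own row keys,
* filter expression and version limit):
*
* {@code --bigtableStartRow=[START_ROW] \
* --bigtableStopRow=[STOP_ROW] \
* --bigtableFilter="[FILTER_EXPRESSION]" \
* --bigtableMaxVersions=[MAX_VERSIONS]
* }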
*
*
* Execute the following command to run the job directly:
*
*
* {@code mvn compile exec:java \
* -Dexec.mainClass=com.google.cloud.bigtable.beam.sequencefiles.ExportJob \
* -Dexec.args="--runner=dataflow \
* --project=[PROJECT_ID] \
* --tempLocation=gs://[BUCKET]/[TEMP_PATH] \
* --zone=[ZONE] \
* --bigtableInstanceId=[INSTANCE] \
* --bigtableTableId=[TABLE] \
* --destinationPath=gs://[BUCKET]/[EXPORT_PATH] \
* --maxNumWorkers=[nodes * 10]"
* }
*
*
* Execute the following command to create the Dataflow template:
*
*
* {@code mvn compile exec:java \
* -Dexec.mainClass=com.google.cloud.bigtable.beam.sequencefiles.ExportJob \
* -Dexec.args="--runner=DataflowRunner \
* --project=[PROJECT_ID] \
* --stagingLocation=gs://[STAGING_PATH] \
* --templateLocation=gs://[TEMPLATE_PATH] \
* --wait=false"
* }
*
*
* There are a few ways to run the pipeline using the template. See the Dataflow documentation
* for details: https://cloud.google.com/dataflow/docs/templates/executing-templates. Optionally,
* you can upload a metadata file that describes the runtime parameters; it can be used for
* parameter validation and more. A sample metadata file can be found at
* "src/main/resources/ExportJob_metadata".
*
*
* An example using the gcloud command line:
*
*
* gcloud beta dataflow jobs run [JOB_NAME] \
* --gcs-location gs://[TEMPLATE_PATH] \
* --parameters bigtableProject=[PROJECT_ID],bigtableInstanceId=[INSTANCE],bigtableTableId=[TABLE],destinationPath=gs://[DESTINATION_PATH],filenamePrefix=[FILENAME_PREFIX]
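*
* The remaining export options (for example the start row, stop row and filter) are also
* exposed as runtime parameters, so a hypothetical variation of the command above could append
* them to --parameters (bracketed values are placeholders):
*
* --parameters bigtableProject=[PROJECT_ID],bigtableInstanceId=[INSTANCE],bigtableTableId=[TABLE],destinationPath=gs://[DESTINATION_PATH],bigtableStartRow=[START_ROW],bigtableStopRow=[STOP_ROW],bigtableFilter=[FILTER_EXPRESSION]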
*
*
* @author igorbernstein2
*/
public class ExportJob {

  public interface ExportOptions extends GcpOptions {

    @Description("This Bigtable App Profile id. (Replication alpha feature).")
    ValueProvider<String> getBigtableAppProfileId();

    @SuppressWarnings("unused")
    void setBigtableAppProfileId(ValueProvider<String> appProfileId);

    @Description("The project that contains the table to export. Defaults to --project.")
    @Default.InstanceFactory(Utils.DefaultBigtableProjectFactory.class)
    ValueProvider<String> getBigtableProject();

    @SuppressWarnings("unused")
    void setBigtableProject(ValueProvider<String> projectId);

    @Description("The Bigtable instance id that contains the table to export.")
    ValueProvider<String> getBigtableInstanceId();

    @SuppressWarnings("unused")
    void setBigtableInstanceId(ValueProvider<String> instanceId);

    @Description("The Bigtable table id to export.")
    ValueProvider<String> getBigtableTableId();

    @SuppressWarnings("unused")
    void setBigtableTableId(ValueProvider<String> tableId);

    @Description("The row where to start the export from, defaults to the first row.")
    @Default.String("")
    ValueProvider<String> getBigtableStartRow();

    @SuppressWarnings("unused")
    void setBigtableStartRow(ValueProvider<String> startRow);

    @Description("The row where to stop the export, defaults to the last row.")
    @Default.String("")
    ValueProvider<String> getBigtableStopRow();

    @SuppressWarnings("unused")
    void setBigtableStopRow(ValueProvider<String> stopRow);

    @Description("Maximum number of cell versions.")
    @Default.Integer(Integer.MAX_VALUE)
    ValueProvider<Integer> getBigtableMaxVersions();

    @SuppressWarnings("unused")
    void setBigtableMaxVersions(ValueProvider<Integer> maxVersions);

    @Description("Filter string. See: http://hbase.apache.org/book.html#thrift.")
    @Default.String("")
    ValueProvider<String> getBigtableFilter();

    @SuppressWarnings("unused")
    void setBigtableFilter(ValueProvider<String> filter);

    @Description("The destination directory")
    ValueProvider<String> getDestinationPath();

    @SuppressWarnings("unused")
    void setDestinationPath(ValueProvider<String> destinationPath);

    @Description("The prefix for each shard in destinationPath")
    @Default.String("part")
    ValueProvider<String> getFilenamePrefix();

    @SuppressWarnings("unused")
    void setFilenamePrefix(ValueProvider<String> filenamePrefix);

    @Description("Wait for pipeline to finish.")
    @Default.Boolean(true)
    boolean getWait();

    @SuppressWarnings("unused")
    void setWait(boolean wait);
  }

  public static void main(String[] args) {
    PipelineOptionsFactory.register(ExportOptions.class);

    ExportOptions opts = PipelineOptionsFactory
        .fromArgs(args).withValidation()
        .as(ExportOptions.class);

    Pipeline pipeline = buildPipeline(opts);
    PipelineResult result = pipeline.run();

    if (opts.getWait()) {
      Utils.waitForPipelineToFinish(result);
    }
  }

  static Pipeline buildPipeline(ExportOptions opts) {
    // Use the base target directory to stage bundles.
    ValueProvider<ResourceId> destinationPath =
        NestedValueProvider.of(opts.getDestinationPath(), new StringToDirResourceId());

    // Concat the destination path & prefix for the final path.
    FilePathPrefix filePathPrefix = new FilePathPrefix(destinationPath, opts.getFilenamePrefix());

    // Write each row as an ImmutableBytesWritable key / Result value pair in SequenceFile format.
    SequenceFileSink<ImmutableBytesWritable, Result> sink =
        new SequenceFileSink<>(
            destinationPath,
            DefaultFilenamePolicy.fromStandardParameters(
                filePathPrefix,
                null,
                "",
                false),
            ImmutableBytesWritable.class, WritableSerialization.class,
            Result.class, ResultSerialization.class);

    Pipeline pipeline = Pipeline.create(Utils.tweakOptions(opts));
    CloudBigtableScanConfiguration config = TemplateUtils.BuildExportConfig(opts);

    // Read from Bigtable, convert each Result to an HBase key/value pair, and write SequenceFiles.
    pipeline
        .apply("Read table", Read.from(CloudBigtableIO.read(config)))
        .apply("Format results", MapElements.via(new ResultToKV()))
        .apply("Write", WriteFiles.to(sink));

    return pipeline;
  }

  static class ResultToKV extends SimpleFunction<Result, KV<ImmutableBytesWritable, Result>> {
    @Override
    public KV<ImmutableBytesWritable, Result> apply(Result input) {
      return KV.of(new ImmutableBytesWritable(input.getRow()), input);
    }
  }

  /** Converts a directory path string into a directory {@link ResourceId}. */
  static class StringToDirResourceId
      implements SerializableFunction<String, ResourceId>, Serializable {
    @Override
    public ResourceId apply(String input) {
      return FileSystems.matchNewResource(input, true /* isDirectory */);
    }
  }

  /** Lazily resolves the filename prefix against the destination directory. */
  static class FilePathPrefix implements ValueProvider<ResourceId>, Serializable {
    private final ValueProvider<ResourceId> destinationPath;
    private final ValueProvider<String> filenamePrefix;

    FilePathPrefix(
        ValueProvider<ResourceId> destinationPath, ValueProvider<String> filenamePrefix) {
      this.destinationPath = destinationPath;
      this.filenamePrefix = filenamePrefix;
    }

    @Override
    public ResourceId get() {
      return destinationPath
          .get()
          .resolve(filenamePrefix.get(), StandardResolveOptions.RESOLVE_FILE);
    }

    @Override
    public boolean isAccessible() {
      return destinationPath.isAccessible() && filenamePrefix.isAccessible();
    }
  }
}