/*
 * Copyright 2017 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.google.cloud.bigtable.beam.sequencefiles;

import com.google.bigtable.repackaged.com.google.api.core.InternalExtensionOnly;
import com.google.cloud.bigtable.beam.CloudBigtableIO;
import com.google.cloud.bigtable.beam.CloudBigtableScanConfiguration;
import com.google.cloud.bigtable.beam.TemplateUtils;
import com.google.cloud.bigtable.thirdparty.org.apache.hadoop.hbase.mapreduce.ResultSerialization;
import java.io.Serializable;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.PipelineResult;
import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
import org.apache.beam.sdk.io.DefaultFilenamePolicy;
import org.apache.beam.sdk.io.FileSystems;
import org.apache.beam.sdk.io.Read;
import org.apache.beam.sdk.io.WriteFiles;
import org.apache.beam.sdk.io.fs.ResolveOptions.StandardResolveOptions;
import org.apache.beam.sdk.io.fs.ResourceId;
import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.options.ValueProvider;
import org.apache.beam.sdk.options.ValueProvider.NestedValueProvider;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.transforms.SerializableFunction;
import org.apache.beam.sdk.transforms.SimpleFunction;
import org.apache.beam.sdk.values.KV;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.io.serializer.WritableSerialization;

/**
 * Beam job to export a Bigtable table to a set of SequenceFiles. Afterwards, the files can be
 * either imported into another Bigtable or HBase table. You can limit the rows and columns
 * exported using the options in {@link ExportOptions}. Please note that the rows in SequenceFiles
 * will not be sorted.
 *
 * <p>Furthermore, you can export a subset of the data using a combination of --bigtableStartRow,
 * --bigtableStopRow and --bigtableFilter.
 *
 * <p>Execute the following command to run the job directly:
 *
 * <pre>
 * {@code mvn compile exec:java \
 *    -Dexec.mainClass=com.google.cloud.bigtable.beam.sequencefiles.ExportJob \
 *    -Dexec.args="--runner=dataflow \
 *    --project=[PROJECT_ID] \
 *    --tempLocation=gs://[BUCKET]/[TEMP_PATH] \
 *    --bigtableInstanceId=[INSTANCE] \
 *    --bigtableTableId=[TABLE] \
 *    --destinationPath=gs://[BUCKET]/[EXPORT_PATH] \
 *    --maxNumWorkers=[nodes * 10]"
 * }
 * </pre>
 *
 * <p>Execute the following command to create the Dataflow template:
 *
 * <pre>
 * mvn compile exec:java \
 *   -DmainClass=com.google.cloud.bigtable.beam.sequencefiles.ExportJob \
 *   -Dexec.args="--runner=DataflowRunner \
 *                --project=[PROJECT_ID] \
 *                --stagingLocation=gs://[STAGING_PATH] \
 *                --templateLocation=gs://[TEMPLATE_PATH] \
 *                --wait=false"
 * </pre>
 *
 * <p>There are a few ways to run the pipeline using the template. See the Dataflow documentation
 * for details: https://cloud.google.com/dataflow/docs/templates/executing-templates. Optionally,
 * you can upload a metadata file that contains information about the runtime parameters, which
 * can be used for parameter validation and more. A sample metadata file can be found at
 * "src/main/resources/ExportJob_metadata".
 *
 * <p>An example using the gcloud command line:
 *
 * <pre>
 * gcloud beta dataflow jobs run [JOB_NAME] \
 *   --gcs-location gs://[TEMPLATE_PATH] \
 *   --parameters bigtableProject=[PROJECT_ID],bigtableInstanceId=[INSTANCE],bigtableTableId=[TABLE],destinationPath=gs://[DESTINATION_PATH],filenamePrefix=[FILENAME_PREFIX]
 * </pre>
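 *
 * <p>For illustration, assuming the template above has been created: the subset-export options
 * described earlier (--bigtableStartRow, --bigtableStopRow, --bigtableFilter) are
 * ValueProvider-backed, so they can also be supplied as runtime template parameters of the same
 * name. Bracketed values below are placeholders:
 *
 * <pre>
 * gcloud beta dataflow jobs run [JOB_NAME] \
 *   --gcs-location gs://[TEMPLATE_PATH] \
 *   --parameters bigtableProject=[PROJECT_ID],bigtableInstanceId=[INSTANCE],bigtableTableId=[TABLE],destinationPath=gs://[DESTINATION_PATH],bigtableStartRow=[START_ROW],bigtableStopRow=[STOP_ROW]
 * </pre>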
 */
@InternalExtensionOnly
public class ExportJob {

  /** Pipeline options for the export. ValueProvider-typed options can be set at template runtime. */
  public interface ExportOptions extends GcpOptions {

    @Description("This Bigtable App Profile id.")
    ValueProvider<String> getBigtableAppProfileId();

    @SuppressWarnings("unused")
    void setBigtableAppProfileId(ValueProvider<String> appProfileId);

    @Description("The project that contains the table to export. Defaults to --project.")
    @Default.InstanceFactory(Utils.DefaultBigtableProjectFactory.class)
    ValueProvider<String> getBigtableProject();

    @SuppressWarnings("unused")
    void setBigtableProject(ValueProvider<String> projectId);

    @Description("The Bigtable instance id that contains the table to export.")
    ValueProvider<String> getBigtableInstanceId();

    @SuppressWarnings("unused")
    void setBigtableInstanceId(ValueProvider<String> instanceId);

    @Description("The Bigtable table id to export.")
    ValueProvider<String> getBigtableTableId();

    @SuppressWarnings("unused")
    void setBigtableTableId(ValueProvider<String> tableId);

    @Description("The row where to start the export from, defaults to the first row.")
    @Default.String("")
    ValueProvider<String> getBigtableStartRow();

    @SuppressWarnings("unused")
    void setBigtableStartRow(ValueProvider<String> startRow);

    @Description("The row where to stop the export, defaults to the last row.")
    @Default.String("")
    ValueProvider<String> getBigtableStopRow();

    @SuppressWarnings("unused")
    void setBigtableStopRow(ValueProvider<String> stopRow);

    @Description("Maximum number of cell versions.")
    @Default.Integer(Integer.MAX_VALUE)
    ValueProvider<Integer> getBigtableMaxVersions();

    @SuppressWarnings("unused")
    void setBigtableMaxVersions(ValueProvider<Integer> maxVersions);

    @Description("Filter string. See: http://hbase.apache.org/book.html#thrift.")
    @Default.String("")
    ValueProvider<String> getBigtableFilter();

    @SuppressWarnings("unused")
    void setBigtableFilter(ValueProvider<String> filter);

    @Description("The destination directory")
    ValueProvider<String> getDestinationPath();

    @SuppressWarnings("unused")
    void setDestinationPath(ValueProvider<String> destinationPath);

    @Description("The prefix for each shard in destinationPath")
    @Default.String("part")
    ValueProvider<String> getFilenamePrefix();

    @SuppressWarnings("unused")
    void setFilenamePrefix(ValueProvider<String> filenamePrefix);

    @Description("Wait for pipeline to finish.")
    @Default.Boolean(true)
    boolean getWait();

    @SuppressWarnings("unused")
    void setWait(boolean wait);
  }

  public static void main(String[] args) {
    PipelineOptionsFactory.register(ExportOptions.class);

    ExportOptions opts =
        PipelineOptionsFactory.fromArgs(args).withValidation().as(ExportOptions.class);

    Pipeline pipeline = buildPipeline(opts);

    PipelineResult result = pipeline.run();

    if (opts.getWait()) {
      Utils.waitForPipelineToFinish(result);
    }
  }

  static Pipeline buildPipeline(ExportOptions opts) {
    // Use the base target directory to stage bundles
    ValueProvider<ResourceId> destinationPath =
        NestedValueProvider.of(opts.getDestinationPath(), new StringToDirResourceId());

    // Concat the destination path & prefix for the final path
    FilePathPrefix filePathPrefix = new FilePathPrefix(destinationPath, opts.getFilenamePrefix());

    SequenceFileSink<ImmutableBytesWritable, Result> sink =
        new SequenceFileSink<>(
            destinationPath,
            DefaultFilenamePolicy.fromStandardParameters(filePathPrefix, null, "", false),
            ImmutableBytesWritable.class,
            WritableSerialization.class,
            Result.class,
            ResultSerialization.class);

    Pipeline pipeline = Pipeline.create(Utils.tweakOptions(opts));

    CloudBigtableScanConfiguration config = TemplateUtils.buildExportConfig(opts);
    pipeline
        .apply("Read table", Read.from(CloudBigtableIO.read(config)))
        .apply("Format results", MapElements.via(new ResultToKV()))
        .apply("Write", WriteFiles.to(sink));

    return pipeline;
  }

  /** Converts each HBase {@link Result} into a KV keyed by its row key. */
  static class ResultToKV extends SimpleFunction<Result, KV<ImmutableBytesWritable, Result>> {
    @Override
    public KV<ImmutableBytesWritable, Result> apply(Result input) {
      return KV.of(new ImmutableBytesWritable(input.getRow()), input);
    }
  }

  /** Resolves the destination string into a directory {@link ResourceId}. */
  static class StringToDirResourceId
      implements SerializableFunction<String, ResourceId>, Serializable {
    @Override
    public ResourceId apply(String input) {
      return FileSystems.matchNewResource(input, true);
    }
  }

  /** Lazily joins the destination directory and the filename prefix into a single path. */
  static class FilePathPrefix implements ValueProvider<ResourceId>, Serializable {
    private final ValueProvider<ResourceId> destinationPath;
    private final ValueProvider<String> filenamePrefix;

    FilePathPrefix(
        ValueProvider<ResourceId> destinationPath, ValueProvider<String> filenamePrefix) {
      this.destinationPath = destinationPath;
      this.filenamePrefix = filenamePrefix;
    }

    @Override
    public ResourceId get() {
      return destinationPath
          .get()
          .resolve(filenamePrefix.get(), StandardResolveOptions.RESOLVE_FILE);
    }

    @Override
    public boolean isAccessible() {
      return destinationPath.isAccessible() && filenamePrefix.isAccessible();
    }
  }
}
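
/**
 * Illustrative usage sketch only; this class is not part of the pipeline and is not required by
 * it. It shows one way the export could be driven programmatically (for example from a test in
 * this package) instead of via command-line flags, assuming the referenced project, instance,
 * table and GCS bucket exist. All bracketed values are placeholders.
 */
class ExportJobUsageSketch {

  public static void main(String[] args) {
    // Populate the same options that the documented command-line flags would set.
    ExportJob.ExportOptions opts =
        PipelineOptionsFactory.create().as(ExportJob.ExportOptions.class);
    opts.setBigtableProject(ValueProvider.StaticValueProvider.of("[PROJECT_ID]"));
    opts.setBigtableInstanceId(ValueProvider.StaticValueProvider.of("[INSTANCE]"));
    opts.setBigtableTableId(ValueProvider.StaticValueProvider.of("[TABLE]"));
    opts.setDestinationPath(ValueProvider.StaticValueProvider.of("gs://[BUCKET]/[EXPORT_PATH]"));

    // buildPipeline is package-private, so this sketch assumes it is compiled in the same package.
    Pipeline pipeline = ExportJob.buildPipeline(opts);
    PipelineResult result = pipeline.run();
    Utils.waitForPipelineToFinish(result);
  }
}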



