/*
* Copyright 2017 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.cloud.bigtable.beam.sequencefiles;
import com.google.bigtable.repackaged.com.google.api.core.InternalExtensionOnly;
import com.google.cloud.bigtable.beam.CloudBigtableIO;
import com.google.cloud.bigtable.beam.CloudBigtableScanConfiguration;
import com.google.cloud.bigtable.beam.TemplateUtils;
import com.google.cloud.bigtable.thirdparty.org.apache.hadoop.hbase.mapreduce.ResultSerialization;
import java.io.Serializable;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.PipelineResult;
import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
import org.apache.beam.sdk.io.DefaultFilenamePolicy;
import org.apache.beam.sdk.io.FileSystems;
import org.apache.beam.sdk.io.Read;
import org.apache.beam.sdk.io.WriteFiles;
import org.apache.beam.sdk.io.fs.ResolveOptions.StandardResolveOptions;
import org.apache.beam.sdk.io.fs.ResourceId;
import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.options.ValueProvider;
import org.apache.beam.sdk.options.ValueProvider.NestedValueProvider;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.transforms.SerializableFunction;
import org.apache.beam.sdk.transforms.SimpleFunction;
import org.apache.beam.sdk.values.KV;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.io.serializer.WritableSerialization;
/**
 * Beam job to export a Bigtable table to a set of SequenceFiles. The files can then be imported
 * into either another Bigtable table or an HBase table. You can limit the rows and columns
 * exported using the options in {@link ExportOptions}. Please note that the rows in the
 * SequenceFiles will not be sorted.
 *
 * <p>Furthermore, you can export a subset of the data using a combination of --bigtableStartRow,
 * --bigtableStopRow and --bigtableFilter.
 *
 * <p>Execute the following command to run the job directly:
 *
 * <pre>{@code
 * mvn compile exec:java \
 *   -Dexec.mainClass=com.google.cloud.bigtable.beam.sequencefiles.ExportJob \
 *   -Dexec.args="--runner=DataflowRunner \
 *     --project=[PROJECT_ID] \
 *     --tempLocation=gs://[BUCKET]/[TEMP_PATH] \
 *     --bigtableInstanceId=[INSTANCE] \
 *     --bigtableTableId=[TABLE] \
 *     --destinationPath=gs://[BUCKET]/[EXPORT_PATH] \
 *     --maxNumWorkers=[nodes * 10]"
 * }</pre>
 *
 * <p>Execute the following command to create the Dataflow template:
 *
 * <pre>{@code
 * mvn compile exec:java \
 *   -Dexec.mainClass=com.google.cloud.bigtable.beam.sequencefiles.ExportJob \
 *   -Dexec.args="--runner=DataflowRunner \
 *     --project=[PROJECT_ID] \
 *     --stagingLocation=gs://[STAGING_PATH] \
 *     --templateLocation=gs://[TEMPLATE_PATH] \
 *     --wait=false"
 * }</pre>
 *
 * <p>There are a few ways to run the pipeline using the template. See the Dataflow documentation
 * for details: https://cloud.google.com/dataflow/docs/templates/executing-templates. Optionally,
 * you can upload a metadata file that contains information about the runtime parameters, which
 * can be used for parameter validation and more. A sample metadata file can be found at
 * "src/main/resources/ExportJob_metadata".
 *
 * <p>An example using the gcloud command line:
 *
 * <pre>{@code
 * gcloud beta dataflow jobs run [JOB_NAME] \
 *   --gcs-location gs://[TEMPLATE_PATH] \
 *   --parameters bigtableProject=[PROJECT_ID],bigtableInstanceId=[INSTANCE],bigtableTableId=[TABLE],destinationPath=gs://[DESTINATION_PATH],filenamePrefix=[FILENAME_PREFIX]
 * }</pre>
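 *
 * <p>As a rough sketch (this assumes the standard Dataflow template metadata format; see the
 * bundled ExportJob_metadata file for the authoritative version), such a metadata file might look
 * like:
 *
 * <pre>
 * {
 *   "name": "Cloud Bigtable to SequenceFile",
 *   "description": "Export a Cloud Bigtable table to SequenceFiles on Cloud Storage",
 *   "parameters": [
 *     {
 *       "name": "bigtableInstanceId",
 *       "label": "Bigtable instance id",
 *       "helpText": "The id of the Bigtable instance containing the table to export",
 *       "isOptional": false
 *     },
 *     {
 *       "name": "destinationPath",
 *       "label": "Destination path",
 *       "helpText": "Cloud Storage directory to write SequenceFiles to, e.g. gs://mybucket/export",
 *       "isOptional": false
 *     }
 *   ]
 * }
 * </pre>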
*
*/
@InternalExtensionOnly
public class ExportJob {
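  /**
   * Options for {@link ExportJob}. The Bigtable and path parameters are {@link ValueProvider}s so
   * that they can be supplied at template execution time rather than at construction time.
   */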
  public interface ExportOptions extends GcpOptions {
    @Description("The Bigtable App Profile id to use for the export.")
    ValueProvider<String> getBigtableAppProfileId();

    @SuppressWarnings("unused")
    void setBigtableAppProfileId(ValueProvider<String> appProfileId);

    @Description("The project that contains the table to export. Defaults to --project.")
    @Default.InstanceFactory(Utils.DefaultBigtableProjectFactory.class)
    ValueProvider<String> getBigtableProject();

    @SuppressWarnings("unused")
    void setBigtableProject(ValueProvider<String> projectId);

    @Description("The Bigtable instance id that contains the table to export.")
    ValueProvider<String> getBigtableInstanceId();

    @SuppressWarnings("unused")
    void setBigtableInstanceId(ValueProvider<String> instanceId);

    @Description("The Bigtable table id to export.")
    ValueProvider<String> getBigtableTableId();

    @SuppressWarnings("unused")
    void setBigtableTableId(ValueProvider<String> tableId);

    @Description("The row to start the export from. Defaults to the first row.")
    @Default.String("")
    ValueProvider<String> getBigtableStartRow();

    @SuppressWarnings("unused")
    void setBigtableStartRow(ValueProvider<String> startRow);

    @Description("The row to stop the export at. Defaults to the last row.")
    @Default.String("")
    ValueProvider<String> getBigtableStopRow();

    @SuppressWarnings("unused")
    void setBigtableStopRow(ValueProvider<String> stopRow);

    @Description("Maximum number of cell versions to export.")
    @Default.Integer(Integer.MAX_VALUE)
    ValueProvider<Integer> getBigtableMaxVersions();

    @SuppressWarnings("unused")
    void setBigtableMaxVersions(ValueProvider<Integer> maxVersions);

    @Description("Filter string. See: http://hbase.apache.org/book.html#thrift.")
    @Default.String("")
    ValueProvider<String> getBigtableFilter();

    @SuppressWarnings("unused")
    void setBigtableFilter(ValueProvider<String> filter);

    @Description("The destination directory for the SequenceFiles.")
    ValueProvider<String> getDestinationPath();

    @SuppressWarnings("unused")
    void setDestinationPath(ValueProvider<String> destinationPath);

    @Description("The prefix for each shard in destinationPath.")
    @Default.String("part")
    ValueProvider<String> getFilenamePrefix();

    @SuppressWarnings("unused")
    void setFilenamePrefix(ValueProvider<String> filenamePrefix);

    @Description("Whether to wait for the pipeline to finish.")
    @Default.Boolean(true)
    boolean getWait();

    @SuppressWarnings("unused")
    void setWait(boolean wait);
  }
public static void main(String[] args) {
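    // Register ExportOptions so PipelineOptionsFactory recognizes and validates its flags.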
PipelineOptionsFactory.register(ExportOptions.class);
ExportOptions opts =
PipelineOptionsFactory.fromArgs(args).withValidation().as(ExportOptions.class);
Pipeline pipeline = buildPipeline(opts);
PipelineResult result = pipeline.run();
if (opts.getWait()) {
Utils.waitForPipelineToFinish(result);
}
}
static Pipeline buildPipeline(ExportOptions opts) {
// Use the base target directory to stage bundles
    ValueProvider<ResourceId> destinationPath =
NestedValueProvider.of(opts.getDestinationPath(), new StringToDirResourceId());
// Concat the destination path & prefix for the final path
FilePathPrefix filePathPrefix = new FilePathPrefix(destinationPath, opts.getFilenamePrefix());
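    // The sink writes SequenceFile records keyed by row (ImmutableBytesWritable) with HBase
    // Results as values, using Hadoop's WritableSerialization for keys and HBase's
    // ResultSerialization for values, so the files can be read back by the matching import job.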
    SequenceFileSink<ImmutableBytesWritable, Result> sink =
new SequenceFileSink<>(
destinationPath,
DefaultFilenamePolicy.fromStandardParameters(filePathPrefix, null, "", false),
ImmutableBytesWritable.class,
WritableSerialization.class,
Result.class,
ResultSerialization.class);
Pipeline pipeline = Pipeline.create(Utils.tweakOptions(opts));
CloudBigtableScanConfiguration config = TemplateUtils.buildExportConfig(opts);
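    // Three stages: scan the table, key each Result by its row key (SequenceFiles are key/value
    // pairs), and shard the pairs into SequenceFiles under the destination path.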
pipeline
.apply("Read table", Read.from(CloudBigtableIO.read(config)))
.apply("Format results", MapElements.via(new ResultToKV()))
.apply("Write", WriteFiles.to(sink));
return pipeline;
}
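  /** Converts each scanned HBase {@link Result} into a {@link KV} keyed by its row key. */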
  static class ResultToKV extends SimpleFunction<Result, KV<ImmutableBytesWritable, Result>> {
    @Override
    public KV<ImmutableBytesWritable, Result> apply(Result input) {
      return KV.of(new ImmutableBytesWritable(input.getRow()), input);
    }
}
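  /** Parses the user-supplied destination path string into a directory {@link ResourceId}. */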
  static class StringToDirResourceId
      implements SerializableFunction<String, ResourceId>, Serializable {
@Override
public ResourceId apply(String input) {
return FileSystems.matchNewResource(input, true);
}
}
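  /**
   * Lazily resolves the destination directory and the filename prefix into a single {@link
   * ResourceId}. Both inputs are {@link ValueProvider}s, so resolution is deferred until the
   * values are available at runtime (e.g. when running from a template).
   */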
  static class FilePathPrefix implements ValueProvider<ResourceId>, Serializable {
    private final ValueProvider<ResourceId> destinationPath;
    private final ValueProvider<String> filenamePrefix;

    FilePathPrefix(
        ValueProvider<ResourceId> destinationPath, ValueProvider<String> filenamePrefix) {
this.destinationPath = destinationPath;
this.filenamePrefix = filenamePrefix;
}
@Override
public ResourceId get() {
return destinationPath
.get()
.resolve(filenamePrefix.get(), StandardResolveOptions.RESOLVE_FILE);
}
@Override
public boolean isAccessible() {
return destinationPath.isAccessible() && filenamePrefix.isAccessible();
}
}
}