All downloads are free. Search and download functionality uses the official Maven repository.

com.google.cloud.bigtable.beam.hbasesnapshots.ImportJobFromHbaseSnapshot Maven / Gradle / Ivy

There is a newer version: 2.14.8
Show newest version
/*
 * Copyright 2021 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.google.cloud.bigtable.beam.hbasesnapshots;

import com.google.bigtable.repackaged.com.google.api.core.InternalExtensionOnly;
import com.google.cloud.bigtable.beam.CloudBigtableIO;
import com.google.cloud.bigtable.beam.TemplateUtils;
import com.google.cloud.bigtable.beam.sequencefiles.HBaseResultToMutationFn;
import com.google.cloud.bigtable.beam.sequencefiles.ImportJob;
import com.google.cloud.bigtable.beam.sequencefiles.Utils;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.MoreObjects;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.PipelineResult;
import org.apache.beam.sdk.io.hadoop.format.HadoopFormatIO;
import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.Wait;
import org.apache.beam.sdk.util.ReleaseInfo;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;

/**
 * A job that imports data from HBase snapshot exports hosted in Cloud Storage bucket into Cloud
 * Bigtable.
 *
 * 

Example: If you have exported your HBase Snapshot to GCS bucket gs://$HBASE_EXPORT_ROOT_PATH * and want to import snapshot gs://$HBASE_EXPORT_ROOT_PATH/.hbase-snapshot/$SNAPSHOT_NAME into * Cloud Bigtable $TABLE in $INSTANCE, execute the following command to run the job directly: * *

 * mvn compile exec:java \
 *   -DmainClass=com.google.cloud.bigtable.beam.hbasesnapshots.ImportJobFromHbaseSnapshot \
 *   -Dexec.args="--runner=DataflowRunner \
 *                --stagingLocation=gs://$STAGING_PATH \
 *                --project=$PROJECT \
 *                --bigtableInstanceId=$INSTANCE \
 *                --bigtableTableId=$TABLE \
 *                --hbaseSnapshotSourceDir=gs://$HBASE_EXPORT_ROOT_PATH \
 *                --snapshotName=$SNAPSHOT_NAME
 * 
* * Note that in the case of job failures, the temp files generated in the .restore-$JOB_NAME * directory under the snapshot export bucket will not get deleted. Hence one need to either launch * a replacement job with the same jobName to re-run the job or manually delete this directory. */ @InternalExtensionOnly public class ImportJobFromHbaseSnapshot { private static final Log LOG = LogFactory.getLog(ImportJobFromHbaseSnapshot.class); private static final String CONTAINER_IMAGE_PATH_PREFIX = "gcr.io/cloud-bigtable-ecosystem/unified-harness:"; public interface ImportOptions extends ImportJob.ImportOptions { @Description("The HBase root dir where HBase snapshot files resides.") String getHbaseSnapshotSourceDir(); @SuppressWarnings("unused") void setHbaseSnapshotSourceDir(String hbaseSnapshotSourceDir); @Description("Snapshot name") String getSnapshotName(); @SuppressWarnings("unused") void setSnapshotName(String snapshotName); @Description("Is importing Snappy compressed snapshot.") @Default.Boolean(false) Boolean getEnableSnappy(); @SuppressWarnings("unused") void setEnableSnappy(Boolean enableSnappy); } public static void main(String[] args) throws Exception { PipelineOptionsFactory.register(ImportOptions.class); ImportOptions opts = PipelineOptionsFactory.fromArgs(args).withValidation().as(ImportOptions.class); LOG.info("Building Pipeline"); Pipeline pipeline = buildPipeline(opts); LOG.info("Running Pipeline"); PipelineResult result = pipeline.run(); if (opts.getWait()) { Utils.waitForPipelineToFinish(result); } } @VisibleForTesting static Pipeline buildPipeline(ImportOptions opts) throws Exception { if (opts.getEnableSnappy()) { DataflowPipelineOptions dataFlowOpts = opts.as(DataflowPipelineOptions.class); dataFlowOpts.setSdkContainerImage( CONTAINER_IMAGE_PATH_PREFIX + ReleaseInfo.getReleaseInfo().getVersion()); List expOpts = MoreObjects.firstNonNull(dataFlowOpts.getExperiments(), new ArrayList()); if (!expOpts.contains("use_runner_v2")) { expOpts = new 
ArrayList<>(expOpts); expOpts.add("use_runner_v2"); } dataFlowOpts.setExperiments(expOpts); } Pipeline pipeline = Pipeline.create(Utils.tweakOptions(opts)); HBaseSnapshotInputConfigBuilder configurationBuilder = new HBaseSnapshotInputConfigBuilder() .setProjectId(opts.getProject()) .setHbaseSnapshotSourceDir(opts.getHbaseSnapshotSourceDir()) .setSnapshotName(opts.getSnapshotName()) .setRestoreDirSuffix(opts.getJobName()); PCollection> readResult = pipeline.apply( "Read from HBase Snapshot", HadoopFormatIO.read() .withConfiguration(configurationBuilder.build())); readResult .apply("Create Mutations", ParDo.of(new HBaseResultToMutationFn())) .apply( "Write to Bigtable", CloudBigtableIO.writeToTable( TemplateUtils.buildImportConfig(opts, "HBaseSnapshotImportJob"))); final List> sourceAndRestoreFolders = Arrays.asList( KV.of(opts.getHbaseSnapshotSourceDir(), configurationBuilder.getRestoreDir())); pipeline .apply(Create.of(sourceAndRestoreFolders)) .apply(Wait.on(readResult)) .apply(ParDo.of(new CleanupHBaseSnapshotRestoreFilesFn())); return pipeline; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy