All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.hudi.utilities.HiveIncrementalPuller Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.utilities;

import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.util.FileIOUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.hadoop.fs.HadoopFSUtils;
import org.apache.hudi.utilities.exception.HoodieIncrementalPullException;
import org.apache.hudi.utilities.exception.HoodieIncrementalPullSQLException;

import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.fs.permission.FsPermission;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.stringtemplate.v4.ST;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.Serializable;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.List;
import java.util.Scanner;
import java.util.stream.Collectors;

/**
 * Utility to pull data after a given commit, based on the supplied HiveQL and save the delta as another hive temporary
 * table. This temporary table can be further read using {@link org.apache.hudi.utilities.sources.HiveIncrPullSource} and the changes can
 * be applied to the target table.
 * 

* Current Limitations: *

* - Only the source table can be incrementally pulled (usually the largest table) - The incrementally pulled table * can't be referenced more than once. */ public class HiveIncrementalPuller { private static final Logger LOG = LoggerFactory.getLogger(HiveIncrementalPuller.class); public static class Config implements Serializable { @Parameter(names = {"--hiveUrl"}) public String hiveJDBCUrl = "jdbc:hive2://localhost:10014/;transportMode=http;httpPath=hs2"; @Parameter(names = {"--hiveUser"}) public String hiveUsername = "hive"; @Parameter(names = {"--hivePass"}) public String hivePassword = ""; @Parameter(names = {"--queue"}) public String yarnQueueName = "hadoop-queue"; @Parameter(names = {"--tmp"}) public String hoodieTmpDir = "/app/hoodie/intermediate"; @Parameter(names = {"--extractSQLFile"}, required = true) public String incrementalSQLFile; @Parameter(names = {"--sourceDb"}, required = true) public String sourceDb; @Parameter(names = {"--sourceTable"}, required = true) public String sourceTable; @Parameter(names = {"--targetDb"}) public String targetDb; @Parameter(names = {"--targetTable"}, required = true) public String targetTable; @Parameter(names = {"--tmpdb"}) public String tmpDb = "tmp"; @Parameter(names = {"--fromCommitTime"}) public String fromCommitTime; @Parameter(names = {"--maxCommits"}) public int maxCommits = 3; @Parameter(names = {"--fsDefaultFs"}) public String fsDefaultFs = "file:///"; @Parameter(names = {"--help", "-h"}, help = true) public Boolean help = false; } static { String driverName = "org.apache.hive.jdbc.HiveDriver"; try { Class.forName(driverName); } catch (ClassNotFoundException e) { throw new IllegalStateException("Could not find " + driverName + " in classpath. 
", e); } } private Connection connection; protected final Config config; private final ST incrementalPullSQLTemplate; public HiveIncrementalPuller(Config config) throws IOException { this.config = config; validateConfig(config); String templateContent = FileIOUtils.readAsUTFString(this.getClass().getResourceAsStream("/IncrementalPull.sqltemplate")); incrementalPullSQLTemplate = new ST(templateContent); } private void validateConfig(Config config) { if (config.maxCommits == -1) { config.maxCommits = Integer.MAX_VALUE; } } public void saveDelta() throws IOException { Configuration conf = new Configuration(); conf.set("fs.defaultFS",config.fsDefaultFs); FileSystem fs = FileSystem.get(conf); Statement stmt = null; try { if (config.fromCommitTime == null) { config.fromCommitTime = inferCommitTime(fs); LOG.info("FromCommitTime inferred as " + config.fromCommitTime); } LOG.info("FromCommitTime - " + config.fromCommitTime); String sourceTableLocation = getTableLocation(config.sourceDb, config.sourceTable); String lastCommitTime = getLastCommitTimePulled(fs, sourceTableLocation); if (lastCommitTime == null) { LOG.info("Nothing to pull. However we will continue to create a empty table"); lastCommitTime = config.fromCommitTime; } Connection conn = getConnection(); stmt = conn.createStatement(); // drop the temp table if exists String tempDbTable = config.tmpDb + "." 
+ config.targetTable + "__" + config.sourceTable; String tempDbTablePath = config.hoodieTmpDir + "/" + config.targetTable + "__" + config.sourceTable + "/" + lastCommitTime; executeStatement("drop table if exists " + tempDbTable, stmt); deleteHDFSPath(fs, tempDbTablePath); if (!ensureTempPathExists(fs, lastCommitTime)) { throw new IllegalStateException("Could not create target path at " + new Path(config.hoodieTmpDir, config.targetTable + "/" + lastCommitTime)); } initHiveBeelineProperties(stmt); executeIncrementalSQL(tempDbTable, tempDbTablePath, stmt); LOG.info("Finished HoodieReader execution"); } catch (SQLException e) { LOG.error("Exception when executing SQL", e); throw new IOException("Could not scan " + config.sourceTable + " incrementally", e); } finally { try { if (stmt != null) { stmt.close(); } } catch (SQLException e) { LOG.error("Could not close the resultSet opened ", e); } } } private void executeIncrementalSQL(String tempDbTable, String tempDbTablePath, Statement stmt) throws FileNotFoundException, SQLException { incrementalPullSQLTemplate.add("tempDbTable", tempDbTable); incrementalPullSQLTemplate.add("tempDbTablePath", tempDbTablePath); String storedAsClause = getStoredAsClause(); incrementalPullSQLTemplate.add("storedAsClause", storedAsClause); String incrementalSQL = new Scanner(new File(config.incrementalSQLFile)).useDelimiter("\\Z").next(); if (!incrementalSQL.contains(config.sourceDb + "." + config.sourceTable)) { LOG.error("Incremental SQL does not have " + config.sourceDb + "." + config.sourceTable + ", which means its pulling from a different table. Fencing this from happening."); throw new HoodieIncrementalPullSQLException( "Incremental SQL does not have " + config.sourceDb + "." + config.sourceTable); } if (!incrementalSQL.contains("`_hoodie_commit_time` > '%s'")) { LOG.error("Incremental SQL : " + incrementalSQL + " does not contain `_hoodie_commit_time` > '%s'. 
Please add " + "this clause for incremental to work properly."); throw new HoodieIncrementalPullSQLException( "Incremental SQL does not have clause `_hoodie_commit_time` > '%s', which " + "means its not pulling incrementally"); } incrementalPullSQLTemplate.add("incrementalSQL", String.format(incrementalSQL, config.fromCommitTime)); String sql = incrementalPullSQLTemplate.render(); // Check if the SQL is pulling from the right database executeStatement(sql, stmt); } private String getStoredAsClause() { return "STORED AS AVRO"; } private void initHiveBeelineProperties(Statement stmt) throws SQLException { LOG.info("Setting up Hive JDBC Session with properties"); // set the queue executeStatement("set mapred.job.queue.name=" + config.yarnQueueName, stmt); // Set the inputFormat to HoodieCombineHiveInputFormat executeStatement("set hive.input.format=org.apache.hudi.hadoop.hive.HoodieCombineHiveInputFormat", stmt); // Allow queries without partition predicate executeStatement("set hive.strict.checks.large.query=false", stmt); // Don't gather stats for the table created executeStatement("set hive.stats.autogather=false", stmt); // Set the hoodie mode executeStatement("set hoodie." + config.sourceTable + ".consume.mode=INCREMENTAL", stmt); // Set the from commit time executeStatement("set hoodie." + config.sourceTable + ".consume.start.timestamp=" + config.fromCommitTime, stmt); // Set number of commits to pull executeStatement("set hoodie." + config.sourceTable + ".consume.max.commits=" + config.maxCommits, stmt); } private boolean deleteHDFSPath(FileSystem fs, String path) throws IOException { LOG.info("Deleting path " + path); return fs.delete(new Path(path), true); } private void executeStatement(String sql, Statement stmt) throws SQLException { LOG.info("Executing: " + sql); stmt.execute(sql); } private String inferCommitTime(FileSystem fs) throws IOException { LOG.info("FromCommitTime not specified. Trying to infer it from Hoodie table " + config.targetDb + "." 
+ config.targetTable); String targetDataLocation = getTableLocation(config.targetDb, config.targetTable); return scanForCommitTime(fs, targetDataLocation); } private String getTableLocation(String db, String table) { ResultSet resultSet = null; Statement stmt = null; try { Connection conn = getConnection(); stmt = conn.createStatement(); resultSet = stmt.executeQuery("describe formatted `" + db + "." + table + "`"); while (resultSet.next()) { if (resultSet.getString(1).trim().equals("Location:")) { LOG.info("Inferred table location for " + db + "." + table + " as " + resultSet.getString(2)); return resultSet.getString(2); } } } catch (SQLException e) { throw new HoodieIncrementalPullException("Failed to get data location for table " + db + "." + table, e); } finally { try { if (stmt != null) { stmt.close(); } if (resultSet != null) { resultSet.close(); } } catch (SQLException e) { LOG.error("Could not close the resultSet opened ", e); } } return null; } private String scanForCommitTime(FileSystem fs, String targetDataPath) throws IOException { if (targetDataPath == null) { throw new IllegalArgumentException("Please specify either --fromCommitTime or --targetDataPath"); } if (!fs.exists(new Path(targetDataPath)) || !fs.exists(new Path(targetDataPath + "/.hoodie"))) { return "0"; } HoodieTableMetaClient metadata = HoodieTableMetaClient.builder() .setConf(HadoopFSUtils.getStorageConfWithCopy(fs.getConf())).setBasePath(targetDataPath).build(); Option lastCommit = metadata.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().lastInstant(); if (lastCommit.isPresent()) { return lastCommit.get().requestedTime(); } return "0"; } private boolean ensureTempPathExists(FileSystem fs, String lastCommitTime) throws IOException { Path targetBaseDirPath = new Path(config.hoodieTmpDir, config.targetTable + "__" + config.sourceTable); if (!fs.exists(targetBaseDirPath)) { LOG.info("Creating " + targetBaseDirPath + " with permission drwxrwxrwx"); boolean result = 
FileSystem.mkdirs(fs, targetBaseDirPath, new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL)); if (!result) { throw new HoodieException("Could not create " + targetBaseDirPath + " with the required permissions"); } } Path targetPath = new Path(targetBaseDirPath, lastCommitTime); if (fs.exists(targetPath)) { boolean result = fs.delete(targetPath, true); if (!result) { throw new HoodieException("Could not delete existing " + targetPath); } } LOG.info("Creating " + targetPath + " with permission drwxrwxrwx"); return FileSystem.mkdirs(fs, targetBaseDirPath, new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL)); } private String getLastCommitTimePulled(FileSystem fs, String sourceTableLocation) { HoodieTableMetaClient metadata = HoodieTableMetaClient.builder() .setConf(HadoopFSUtils.getStorageConfWithCopy(fs.getConf())) .setBasePath(sourceTableLocation).build(); List commitsToSync = metadata.getActiveTimeline().getCommitsTimeline().filterCompletedInstants() .findInstantsAfter(config.fromCommitTime, config.maxCommits).getInstantsAsStream().map(HoodieInstant::requestedTime) .collect(Collectors.toList()); if (commitsToSync.isEmpty()) { LOG.warn( "Nothing to sync. 
All commits in " + config.sourceTable + " are " + metadata.getActiveTimeline().getCommitsTimeline() .filterCompletedInstants().getInstants() + " and from commit time is " + config.fromCommitTime); return null; } LOG.info("Syncing commits " + commitsToSync); return commitsToSync.get(commitsToSync.size() - 1); } private Connection getConnection() throws SQLException { if (connection == null) { LOG.info("Getting Hive Connection to " + config.hiveJDBCUrl); this.connection = DriverManager.getConnection(config.hiveJDBCUrl, config.hiveUsername, config.hivePassword); } return connection; } public static void main(String[] args) throws IOException { final Config cfg = new Config(); JCommander cmd = new JCommander(cfg, null, args); if (cfg.help || args.length == 0) { cmd.usage(); System.exit(1); } new HiveIncrementalPuller(cfg).saveDelta(); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy