/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.client;
import org.apache.hudi.avro.model.HoodieCompactionPlan;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.model.HoodieAvroRecord;
import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.model.HoodieFileFormat;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.util.CompactionUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieIndexConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.data.HoodieJavaRDD;
import org.apache.hudi.exception.HoodieIndexException;
import org.apache.hudi.index.HoodieIndex;
import org.apache.hudi.index.SparkHoodieIndexFactory;
import org.apache.hudi.storage.StorageConfiguration;
import org.apache.hudi.table.HoodieSparkTable;
import org.apache.hudi.table.HoodieTable;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.StructType;
import java.io.Serializable;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import scala.Tuple2;
/**
* Provides an RDD based API for accessing/filtering Hoodie tables, based on keys.
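 *
 * <p>A minimal usage sketch, not part of this class: it assumes an existing Hudi table at the hypothetical path
 * {@code /tmp/hoodie/trips}, a running {@code SparkSession} named {@code spark}, and, beyond the imports above,
 * {@code java.util.Arrays} and {@code org.apache.spark.api.java.JavaSparkContext}.
 * <pre>{@code
 * JavaSparkContext jsc = JavaSparkContext.fromSparkContext(spark.sparkContext());
 * HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
 * // read-only client over the table, backed by the default Bloom index
 * SparkRDDReadClient<?> readClient =
 *     new SparkRDDReadClient<>(engineContext, "/tmp/hoodie/trips", spark.sqlContext());
 * // look up two keys and materialize the matching rows as a DataFrame
 * JavaRDD<HoodieKey> keys = jsc.parallelize(Arrays.asList(
 *     new HoodieKey("rider-123", "2024/01/01"),
 *     new HoodieKey("rider-456", "2024/01/02")));
 * Dataset<Row> rows = readClient.readROView(keys, 2);
 * }</pre>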
*/
public class SparkRDDReadClient<T> implements Serializable {
  private static final long serialVersionUID = 1L;
  /**
   * TODO: We need to persist the index type into hoodie.properties and be able to access the index with just a
   * simple base path pointing to the table. Until then, just always assume a BloomIndex.
   */
  private final transient HoodieIndex<?, ?> index;
  private HoodieTable hoodieTable;
  private transient Option<SQLContext> sqlContextOpt;
  private final transient HoodieSparkEngineContext context;
  private final transient StorageConfiguration<?> storageConf;
/**
   * @param context  Hudi Spark engine context
   * @param basePath path to Hoodie table
*/
public SparkRDDReadClient(HoodieSparkEngineContext context, String basePath) {
this(context, HoodieWriteConfig.newBuilder().withPath(basePath)
// by default we use HoodieBloomIndex
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build());
}
/**
   * @param context    Hudi Spark engine context
   * @param basePath   base path of the table
   * @param sqlContext {@link SQLContext} instance to use for dataframe operations
*/
public SparkRDDReadClient(HoodieSparkEngineContext context, String basePath, SQLContext sqlContext) {
this(context, basePath);
this.sqlContextOpt = Option.of(sqlContext);
}
/**
   * Initializes the {@link SparkRDDReadClient} with engine context, base path, SQL context and index type.
*
* @param context Hudi Spark engine context
* @param basePath Base path of the table
* @param sqlContext {@link SQLContext} instance
* @param indexType Hudi index type
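   *
   * <p>A hypothetical call, not from this codebase (assumes an existing {@code engineContext}, {@code sqlContext}
   * and table path): pick a non-default index type, e.g. the simple index.
   * <pre>{@code
   * SparkRDDReadClient<?> client = new SparkRDDReadClient<>(
   *     engineContext, "/tmp/hoodie/trips", sqlContext, HoodieIndex.IndexType.SIMPLE);
   * }</pre>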
*/
public SparkRDDReadClient(HoodieSparkEngineContext context, String basePath, SQLContext sqlContext, HoodieIndex.IndexType indexType) {
this(context, HoodieWriteConfig.newBuilder().withPath(basePath)
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(indexType).build()).build());
this.sqlContextOpt = Option.of(sqlContext);
}
/**
   * @param context      Hudi Spark engine context
   * @param clientConfig instance of HoodieWriteConfig
*/
public SparkRDDReadClient(HoodieSparkEngineContext context, HoodieWriteConfig clientConfig) {
this.context = context;
this.storageConf = context.getStorageConf();
final String basePath = clientConfig.getBasePath();
    // Create a Hoodie table which encapsulates the commits and files visible
HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
.setConf(storageConf.newInstance()).setBasePath(basePath).setLoadActiveTimelineOnLoad(true).build();
this.hoodieTable = HoodieSparkTable.create(clientConfig, context, metaClient);
this.index = SparkHoodieIndexFactory.createIndex(clientConfig);
this.sqlContextOpt = Option.empty();
}
/**
* Adds support for accessing Hoodie built tables from SparkSQL, as you normally would.
*
   * @param conf SparkConf to decorate with Hudi related settings
   * @return SparkConf object to be used to construct the SparkContext by caller
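   *
   * <p>A minimal sketch of the intended call order when building the Spark context yourself; everything besides
   * this class is standard Spark API.
   * <pre>{@code
   * SparkConf conf = SparkRDDReadClient.addHoodieSupport(new SparkConf().setAppName("hoodie-read"));
   * JavaSparkContext jsc = new JavaSparkContext(conf);
   * }</pre>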
*/
public static SparkConf addHoodieSupport(SparkConf conf) {
conf.set("spark.sql.hive.convertMetastoreParquet", "false");
return conf;
}
private void assertSqlContext() {
if (!sqlContextOpt.isPresent()) {
throw new IllegalStateException("SQLContext must be set, when performing dataframe operations");
}
}
  private Option<String> convertToDataFilePath(Option<Pair<String, String>> partitionPathFileIDPair) {
if (partitionPathFileIDPair.isPresent()) {
HoodieBaseFile dataFile = hoodieTable.getBaseFileOnlyView()
.getLatestBaseFile(partitionPathFileIDPair.get().getLeft(), partitionPathFileIDPair.get().getRight()).get();
return Option.of(dataFile.getPath());
} else {
return Option.empty();
}
}
/**
* Given a bunch of hoodie keys, fetches all the individual records out as a data frame.
*
   * @param hoodieKeys  keys to look up
   * @param parallelism join parallelism used to match rows back to the supplied keys
   * @return a dataframe containing the rows for the keys that were found
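   *
   * <p>A hypothetical follow-up, not from this codebase (assumes an already constructed {@code readClient} and a
   * {@code JavaRDD<HoodieKey>} named {@code keys}): the returned frame includes the Hudi metadata columns, which
   * can be dropped if only payload fields are needed.
   * <pre>{@code
   * Dataset<Row> found = readClient.readROView(keys, 4);
   * Dataset<Row> payload = found.drop(HoodieRecord.RECORD_KEY_METADATA_FIELD,
   *     HoodieRecord.PARTITION_PATH_METADATA_FIELD);
   * }</pre>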
*/
  public Dataset<Row> readROView(JavaRDD<HoodieKey> hoodieKeys, int parallelism) {
assertSqlContext();
    JavaPairRDD<HoodieKey, Option<Pair<String, String>>> lookupResultRDD = checkExists(hoodieKeys);
    JavaPairRDD<HoodieKey, Option<String>> keyToFileRDD =
        lookupResultRDD.mapToPair(r -> new Tuple2<>(r._1, convertToDataFilePath(r._2)));
    List<String> paths = keyToFileRDD.filter(keyFileTuple -> keyFileTuple._2().isPresent())
        .map(keyFileTuple -> keyFileTuple._2().get()).collect();
    // record locations might be the same for multiple keys, so we need a unique list of paths
    Set<String> uniquePaths = new HashSet<>(paths);
    Dataset<Row> originalDF = null;
// read files based on the file extension name
if (paths.size() == 0 || paths.get(0).endsWith(HoodieFileFormat.PARQUET.getFileExtension())) {
originalDF = sqlContextOpt.get().read().parquet(uniquePaths.toArray(new String[uniquePaths.size()]));
} else if (paths.get(0).endsWith(HoodieFileFormat.ORC.getFileExtension())) {
originalDF = sqlContextOpt.get().read().orc(uniquePaths.toArray(new String[uniquePaths.size()]));
}
StructType schema = originalDF.schema();
    JavaPairRDD<HoodieKey, Row> keyRowRDD = originalDF.javaRDD().mapToPair(row -> {
HoodieKey key = new HoodieKey(row.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD),
row.getAs(HoodieRecord.PARTITION_PATH_METADATA_FIELD));
return new Tuple2<>(key, row);
});
    // Now, further filter to retain only the rows that match the supplied hoodie keys
    JavaRDD<Row> rowRDD = keyRowRDD.join(keyToFileRDD, parallelism).map(tuple -> tuple._2()._1());
return sqlContextOpt.get().createDataFrame(rowRDD, schema);
}
/**
   * Checks if the given [Keys] exist in the hoodie table and returns [Key, Option[(partitionPath, fileId)]]. If the
   * Option value is not present, the key was not found. If it is present, it holds the partition path and file id
   * of the latest base file containing the record for that key.
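   *
   * <p>A hypothetical usage, not from this codebase (assumes an already constructed {@code readClient} and a
   * {@code JavaRDD<HoodieKey>} named {@code keys}): split the keys into found and missing sets.
   * <pre>{@code
   * JavaPairRDD<HoodieKey, Option<Pair<String, String>>> result = readClient.checkExists(keys);
   * List<HoodieKey> found = result.filter(t -> t._2().isPresent()).keys().collect();
   * List<HoodieKey> missing = result.filter(t -> !t._2().isPresent()).keys().collect();
   * }</pre>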
*/
  public JavaPairRDD<HoodieKey, Option<Pair<String, String>>> checkExists(JavaRDD<HoodieKey> hoodieKeys) {
return HoodieJavaRDD.getJavaRDD(
index.tagLocation(HoodieJavaRDD.of(hoodieKeys.map(k -> new HoodieAvroRecord<>(k, null))),
context, hoodieTable))
.mapToPair(hr -> new Tuple2<>(hr.getKey(), hr.isCurrentLocationKnown()
? Option.of(Pair.of(hr.getPartitionPath(), hr.getCurrentLocation().getFileId()))
: Option.empty())
);
}
/**
   * Filter out HoodieRecords that already exist in the output folder. This is useful in deduplication.
*
* @param hoodieRecords Input RDD of Hoodie records.
* @return A subset of hoodieRecords RDD, with existing records filtered out.
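   *
   * <p>A hypothetical usage, not from this codebase (assumes an already constructed {@code readClient} and an
   * incoming {@code JavaRDD<HoodieRecord<T>>} named {@code incoming}, produced elsewhere): keep only the records
   * whose keys are not yet present in the table before handing them to a write client.
   * <pre>{@code
   * JavaRDD<HoodieRecord<T>> newRecords = readClient.filterExists(incoming);
   * long toInsert = newRecords.count();
   * }</pre>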
*/
  public JavaRDD<HoodieRecord<T>> filterExists(JavaRDD<HoodieRecord<T>> hoodieRecords) {
    JavaRDD<HoodieRecord<T>> recordsWithLocation = tagLocation(hoodieRecords);
return recordsWithLocation.filter(v1 -> !v1.isCurrentLocationKnown());
}
/**
   * Looks up the index and tags each incoming record with the location of the file that contains the row (if it is
   * actually present). Callers should de-duplicate the input RDD beforehand if required.
*
* @param hoodieRecords Input RDD of Hoodie records
* @return Tagged RDD of Hoodie records
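   *
   * <p>A hypothetical usage, not from this codebase (assumes an already constructed {@code readClient} and a
   * {@code JavaRDD<HoodieRecord<T>>} named {@code incoming}): count how many incoming records already have a
   * known location in the table.
   * <pre>{@code
   * JavaRDD<HoodieRecord<T>> tagged = readClient.tagLocation(incoming);
   * long alreadyPresent = tagged.filter(HoodieRecord::isCurrentLocationKnown).count();
   * }</pre>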
*/
  public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> hoodieRecords) throws HoodieIndexException {
return HoodieJavaRDD.getJavaRDD(
index.tagLocation(HoodieJavaRDD.of(hoodieRecords), context, hoodieTable));
}
/**
* Return all pending compactions with instant time for clients to decide what to compact next.
*
   * @return list of (compaction instant time, compaction plan) pairs for all pending compactions
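   *
   * <p>A hypothetical usage, not from this codebase (assumes an already constructed {@code readClient}): list the
   * instants of all pending compactions.
   * <pre>{@code
   * for (Pair<String, HoodieCompactionPlan> pending : readClient.getPendingCompactions()) {
   *   System.out.println("Pending compaction at instant " + pending.getLeft());
   * }
   * }</pre>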
*/
  public List<Pair<String, HoodieCompactionPlan>> getPendingCompactions() {
HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
.setConf(storageConf.newInstance()).setBasePath(hoodieTable.getMetaClient().getBasePath()).setLoadActiveTimelineOnLoad(true).build();
return CompactionUtils.getAllPendingCompactionPlans(metaClient).stream()
.map(
instantWorkloadPair -> Pair.of(instantWorkloadPair.getKey().getTimestamp(), instantWorkloadPair.getValue()))
.collect(Collectors.toList());
}
}