org.apache.iceberg.spark.source.IcebergSource (iceberg-spark2)
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.iceberg.spark.source;

import java.util.Map;
import java.util.Optional;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.Schema;
import org.apache.iceberg.Table;
import org.apache.iceberg.catalog.TableIdentifier;
import org.apache.iceberg.encryption.EncryptionManager;
import org.apache.iceberg.hadoop.HadoopTables;
import org.apache.iceberg.hive.HiveCatalog;
import org.apache.iceberg.hive.HiveCatalogs;
import org.apache.iceberg.io.FileIO;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.spark.SparkSchemaUtil;
import org.apache.iceberg.spark.SparkUtil;
import org.apache.iceberg.types.TypeUtil;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.execution.streaming.StreamExecution;
import org.apache.spark.sql.sources.DataSourceRegister;
import org.apache.spark.sql.sources.v2.DataSourceOptions;
import org.apache.spark.sql.sources.v2.DataSourceV2;
import org.apache.spark.sql.sources.v2.ReadSupport;
import org.apache.spark.sql.sources.v2.StreamWriteSupport;
import org.apache.spark.sql.sources.v2.WriteSupport;
import org.apache.spark.sql.sources.v2.reader.DataSourceReader;
import org.apache.spark.sql.sources.v2.writer.DataSourceWriter;
import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter;
import org.apache.spark.sql.streaming.OutputMode;
import org.apache.spark.sql.types.StructType;

public class IcebergSource implements DataSourceV2, ReadSupport, WriteSupport, DataSourceRegister,
    StreamWriteSupport {

  private SparkSession lazySpark = null;
  private JavaSparkContext lazySparkContext = null;
  private Configuration lazyConf = null;

  @Override
  public String shortName() {
    return "iceberg";
  }
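
  // This short name registers the source with Spark's DataSource registry
  // (via META-INF/services), so callers can say format("iceberg") instead of
  // the fully qualified class name. A minimal usage sketch, assuming a live
  // SparkSession named `spark` and an existing table `db.table` (both
  // hypothetical):
  //
  //   Dataset<Row> df = spark.read().format("iceberg").load("db.table");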

  @Override
  public DataSourceReader createReader(DataSourceOptions options) {
    return createReader(null, options);
  }

  @Override
  public DataSourceReader createReader(StructType readSchema, DataSourceOptions options) {
    Configuration conf = new Configuration(lazyBaseConf());
    Table table = getTableAndResolveHadoopConfiguration(options, conf);
    String caseSensitive = lazySparkSession().conf().get("spark.sql.caseSensitive");
    Broadcast<FileIO> io = lazySparkContext().broadcast(SparkUtil.serializableFileIO(table));
    Broadcast<EncryptionManager> encryptionManager = lazySparkContext().broadcast(table.encryption());

    Reader reader = new Reader(table, io, encryptionManager, Boolean.parseBoolean(caseSensitive), options);
    if (readSchema != null) {
      // convert() will fail if readSchema contains fields not in table.schema()
      SparkSchemaUtil.convert(table.schema(), readSchema);
      reader.pruneColumns(readSchema);
    }

    return reader;
  }
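
  // Example read with options: DataSourceOptions carries reader settings such
  // as Iceberg's documented time-travel options. A sketch, assuming `spark`
  // and a snapshot id 3821550127947089009L that exists in the table's history
  // (both hypothetical):
  //
  //   Dataset<Row> asOfSnapshot = spark.read()
  //       .format("iceberg")
  //       .option("snapshot-id", "3821550127947089009")
  //       .load("db.table");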

  @Override
  public Optional<DataSourceWriter> createWriter(String jobId, StructType dsStruct, SaveMode mode,
                                                 DataSourceOptions options) {
    Preconditions.checkArgument(mode == SaveMode.Append || mode == SaveMode.Overwrite,
        "Save mode %s is not supported", mode);
    Configuration conf = new Configuration(lazyBaseConf());
    Table table = getTableAndResolveHadoopConfiguration(options, conf);
    Schema writeSchema = SparkSchemaUtil.convert(table.schema(), dsStruct);
    TypeUtil.validateWriteSchema(table.schema(), writeSchema, checkNullability(options), checkOrdering(options));
    SparkUtil.validatePartitionTransforms(table.spec());

    String appId = lazySparkSession().sparkContext().applicationId();
    String wapId = lazySparkSession().conf().get("spark.wap.id", null);
    boolean replacePartitions = mode == SaveMode.Overwrite;

    Broadcast<FileIO> io = lazySparkContext().broadcast(SparkUtil.serializableFileIO(table));
    Broadcast<EncryptionManager> encryptionManager = lazySparkContext().broadcast(table.encryption());

    return Optional.of(new Writer(
        table, io, encryptionManager, options, replacePartitions, appId, wapId, writeSchema, dsStruct));
  }
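
  // Example batch write: only Append and Overwrite pass the precondition
  // above, and Overwrite is implemented as a partition replacement
  // (replacePartitions == true). A sketch, assuming a Dataset `df` whose
  // schema matches the table (hypothetical):
  //
  //   df.write()
  //       .format("iceberg")
  //       .mode(SaveMode.Append)   // or SaveMode.Overwrite to replace partitions
  //       .save("db.table");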

  @Override
  public StreamWriter createStreamWriter(String runId, StructType dsStruct,
                                         OutputMode mode, DataSourceOptions options) {
    Preconditions.checkArgument(
        mode == OutputMode.Append() || mode == OutputMode.Complete(),
        "Output mode %s is not supported", mode);
    Configuration conf = new Configuration(lazyBaseConf());
    Table table = getTableAndResolveHadoopConfiguration(options, conf);
    Schema writeSchema = SparkSchemaUtil.convert(table.schema(), dsStruct);
    TypeUtil.validateWriteSchema(table.schema(), writeSchema, checkNullability(options), checkOrdering(options));
    SparkUtil.validatePartitionTransforms(table.spec());

    // Spark 2.4.x passes runId to createStreamWriter instead of the real queryId,
    // so fetch it directly from the sparkContext to make writes idempotent
    String queryId = lazySparkSession().sparkContext().getLocalProperty(StreamExecution.QUERY_ID_KEY());
    String appId = lazySparkSession().sparkContext().applicationId();

    Broadcast<FileIO> io = lazySparkContext().broadcast(SparkUtil.serializableFileIO(table));
    Broadcast<EncryptionManager> encryptionManager = lazySparkContext().broadcast(table.encryption());

    return new StreamingWriter(table, io, encryptionManager, options, queryId, mode, appId, writeSchema, dsStruct);
  }
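
  // Example streaming write: Structured Streaming calls createStreamWriter
  // when the query starts. A sketch, assuming a streaming Dataset
  // `streamingDf` and a writable checkpoint path (both hypothetical):
  //
  //   StreamingQuery query = streamingDf.writeStream()
  //       .format("iceberg")
  //       .outputMode("append")
  //       .option("checkpointLocation", "/tmp/checkpoints/db.table")
  //       .start("db.table");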

  protected Table findTable(DataSourceOptions options, Configuration conf) {
    Optional<String> path = options.get("path");
    Preconditions.checkArgument(path.isPresent(), "Cannot open table: path is not set");

    if (path.get().contains("/")) {
      // contains a slash: treat the path as a file system location
      HadoopTables tables = new HadoopTables(conf);
      return tables.load(path.get());
    } else {
      // no slash: treat the path as a Hive table identifier
      HiveCatalog hiveCatalog = HiveCatalogs.loadCatalog(conf);
      TableIdentifier tableIdentifier = TableIdentifier.parse(path.get());
      return hiveCatalog.loadTable(tableIdentifier);
    }
  }
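
  // The '/' check above is what routes a load() path to one catalog or the
  // other. Two hypothetical examples:
  //
  //   spark.read().format("iceberg").load("db.table");                          // Hive catalog
  //   spark.read().format("iceberg").load("hdfs://nn:8020/warehouse/db/table"); // HadoopTables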

  private SparkSession lazySparkSession() {
    if (lazySpark == null) {
      this.lazySpark = SparkSession.builder().getOrCreate();
    }
    return lazySpark;
  }

  private JavaSparkContext lazySparkContext() {
    if (lazySparkContext == null) {
      this.lazySparkContext = new JavaSparkContext(lazySparkSession().sparkContext());
    }
    return lazySparkContext;
  }

  private Configuration lazyBaseConf() {
    if (lazyConf == null) {
      this.lazyConf = lazySparkSession().sessionState().newHadoopConf();
    }
    return lazyConf;
  }

  private Table getTableAndResolveHadoopConfiguration(
      DataSourceOptions options, Configuration conf) {
    // Overwrite configurations from the Spark Context with configurations from the options.
    mergeIcebergHadoopConfs(conf, options.asMap());
    Table table = findTable(options, conf);
    // Set confs from table properties
    mergeIcebergHadoopConfs(conf, table.properties());
    // Re-apply the options so that they take precedence over table properties.
    mergeIcebergHadoopConfs(conf, options.asMap());
    return table;
  }
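
  // The merge order above gives read/write options the highest precedence,
  // then table properties, then the session's Hadoop conf. A sketch of a
  // per-job override, using a hypothetical S3A endpoint key:
  //
  //   spark.read()
  //       .format("iceberg")
  //       .option("hadoop.fs.s3a.endpoint", "http://localhost:9000")
  //       .load("db.table");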

  private static void mergeIcebergHadoopConfs(
      Configuration baseConf, Map<String, String> options) {
    options.keySet().stream()
        .filter(key -> key.startsWith("hadoop."))
        .forEach(key -> baseConf.set(key.replaceFirst("hadoop.", ""), options.get(key)));
  }
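
  // Only keys with the "hadoop." prefix are copied, and the prefix is
  // stripped before the value lands in the Configuration. Note that
  // replaceFirst takes a regex, but the startsWith filter above guarantees
  // the first match is the leading prefix:
  //
  //   "hadoop.fs.s3a.endpoint" -> baseConf.set("fs.s3a.endpoint", value)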

  private boolean checkNullability(DataSourceOptions options) {
    boolean sparkCheckNullability = Boolean.parseBoolean(lazySpark.conf()
        .get("spark.sql.iceberg.check-nullability", "true"));
    boolean dataFrameCheckNullability = options.getBoolean("check-nullability", true);
    return sparkCheckNullability && dataFrameCheckNullability;
  }

  private boolean checkOrdering(DataSourceOptions options) {
    boolean sparkCheckOrdering = Boolean.parseBoolean(lazySpark.conf()
        .get("spark.sql.iceberg.check-ordering", "true"));
    boolean dataFrameCheckOrdering = options.getBoolean("check-ordering", true);
    return sparkCheckOrdering && dataFrameCheckOrdering;
  }
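
  // Both checks default to on and run only when the session conf and the
  // write option are both true, so either flag alone disables them. A sketch
  // turning off the nullability check for a single write (hypothetical data):
  //
  //   df.write()
  //       .format("iceberg")
  //       .option("check-nullability", "false")
  //       .mode(SaveMode.Append)
  //       .save("db.table");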
}