/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.spark.source;

import java.io.Closeable;
import java.io.IOException;
import java.math.BigDecimal;
import java.nio.ByteBuffer;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.avro.generic.GenericData;
import org.apache.avro.util.Utf8;
import org.apache.iceberg.ContentFile;
import org.apache.iceberg.ContentScanTask;
import org.apache.iceberg.DeleteFile;
import org.apache.iceberg.MetadataColumns;
import org.apache.iceberg.Partitioning;
import org.apache.iceberg.ScanTask;
import org.apache.iceberg.ScanTaskGroup;
import org.apache.iceberg.Schema;
import org.apache.iceberg.StructLike;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableProperties;
import org.apache.iceberg.data.BaseDeleteLoader;
import org.apache.iceberg.data.DeleteFilter;
import org.apache.iceberg.data.DeleteLoader;
import org.apache.iceberg.deletes.DeleteCounter;
import org.apache.iceberg.encryption.EncryptingFileIO;
import org.apache.iceberg.io.CloseableIterator;
import org.apache.iceberg.io.InputFile;
import org.apache.iceberg.mapping.NameMapping;
import org.apache.iceberg.mapping.NameMappingParser;
import org.apache.iceberg.spark.SparkExecutorCache;
import org.apache.iceberg.spark.SparkSchemaUtil;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.Types.NestedField;
import org.apache.iceberg.types.Types.StructType;
import org.apache.iceberg.util.ByteBuffers;
import org.apache.iceberg.util.PartitionUtil;
import org.apache.spark.rdd.InputFileBlockHolder;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
import org.apache.spark.sql.types.Decimal;
import org.apache.spark.unsafe.types.UTF8String;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Base class of Spark readers.
 *
 * @param <T> is the Java class returned by this reader whose objects contain one or more rows.
 */
abstract class BaseReader<T, TaskT extends ScanTask> implements Closeable {
  private static final Logger LOG = LoggerFactory.getLogger(BaseReader.class);

  private final Table table;
  private final Schema tableSchema;
  private final Schema expectedSchema;
  private final boolean caseSensitive;
  private final NameMapping nameMapping;
  private final ScanTaskGroup<TaskT> taskGroup;
  private final Iterator<TaskT> tasks;
  private final DeleteCounter counter;

  private Map<String, InputFile> lazyInputFiles;
  private CloseableIterator<T> currentIterator;
  private T current = null;
  private TaskT currentTask = null;

  BaseReader(
      Table table,
      ScanTaskGroup<TaskT> taskGroup,
      Schema tableSchema,
      Schema expectedSchema,
      boolean caseSensitive) {
    this.table = table;
    this.taskGroup = taskGroup;
    this.tasks = taskGroup.tasks().iterator();
    this.currentIterator = CloseableIterator.empty();
    this.tableSchema = tableSchema;
    this.expectedSchema = expectedSchema;
    this.caseSensitive = caseSensitive;
    String nameMappingString = table.properties().get(TableProperties.DEFAULT_NAME_MAPPING);
    this.nameMapping =
        nameMappingString != null ? NameMappingParser.fromJson(nameMappingString) : null;
    this.counter = new DeleteCounter();
  }
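
  /** Opens a {@link CloseableIterator} of output objects for the given scan task. */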
  protected abstract CloseableIterator<T> open(TaskT task);
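
  /** Returns the content files (data and delete files) referenced by the given scan task. */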
  protected abstract Stream<ContentFile<?>> referencedFiles(TaskT task);

  protected Schema expectedSchema() {
    return expectedSchema;
  }

  protected boolean caseSensitive() {
    return caseSensitive;
  }

  protected NameMapping nameMapping() {
    return nameMapping;
  }

  protected Table table() {
    return table;
  }

  protected DeleteCounter counter() {
    return counter;
  }
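
  /**
   * Advances to the next record, opening the next task's iterator whenever the current one is
   * exhausted, and returns false once all tasks in the group are consumed. If a read fails, the
   * file paths referenced by the current task are logged before the exception is rethrown.
   */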
  public boolean next() throws IOException {
    try {
      while (true) {
        if (currentIterator.hasNext()) {
          this.current = currentIterator.next();
          return true;
        } else if (tasks.hasNext()) {
          this.currentIterator.close();
          this.currentTask = tasks.next();
          this.currentIterator = open(currentTask);
        } else {
          this.currentIterator.close();
          return false;
        }
      }
    } catch (IOException | RuntimeException e) {
      if (currentTask != null && !currentTask.isDataTask()) {
        String filePaths =
            referencedFiles(currentTask)
                .map(file -> file.path().toString())
                .collect(Collectors.joining(", "));
        LOG.error("Error reading file(s): {}", filePaths, e);
      }

      throw e;
    }
  }
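
  /** Returns the record produced by the most recent successful call to {@link #next()}. */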
  public T get() {
    return current;
  }
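
  /**
   * Clears the Spark input file block, closes the current iterator, and drains the remaining
   * task iterator.
   */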
  @Override
  public void close() throws IOException {
    InputFileBlockHolder.unset();

    // close the current iterator
    this.currentIterator.close();

    // exhaust the task iterator
    while (tasks.hasNext()) {
      tasks.next();
    }
  }

  protected InputFile getInputFile(String location) {
    return inputFiles().get(location);
  }
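
  /**
   * Lazily builds the map from file location to {@link InputFile} for every file referenced by
   * the task group, decrypting the files in bulk through the table's {@link EncryptingFileIO}.
   */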
  private Map<String, InputFile> inputFiles() {
    if (lazyInputFiles == null) {
      this.lazyInputFiles =
          EncryptingFileIO.combine(table().io(), table().encryption())
              .bulkDecrypt(
                  () -> taskGroup.tasks().stream().flatMap(this::referencedFiles).iterator());
    }

    return lazyInputFiles;
  }
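
  /**
   * Returns the map of metadata column field id to constant value for the given task. When the
   * read schema includes the partition metadata column, constants are computed against the
   * table-wide partition type so rows from different partition specs share one struct layout.
   */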
  protected Map<Integer, ?> constantsMap(ContentScanTask<?> task, Schema readSchema) {
    if (readSchema.findField(MetadataColumns.PARTITION_COLUMN_ID) != null) {
      StructType partitionType = Partitioning.partitionType(table);
      return PartitionUtil.constantsMap(task, partitionType, BaseReader::convertConstant);
    } else {
      return PartitionUtil.constantsMap(task, BaseReader::convertConstant);
    }
  }
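
  /**
   * Converts a constant from Iceberg's internal representation to the type Spark expects in an
   * {@link InternalRow}: decimals to {@link Decimal}, strings to {@link UTF8String}, fixed and
   * binary values to byte arrays, and structs to {@link GenericInternalRow}, recursing into
   * nested fields. Values of other types are returned unchanged.
   */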
  protected static Object convertConstant(Type type, Object value) {
    if (value == null) {
      return null;
    }

    switch (type.typeId()) {
      case DECIMAL:
        return Decimal.apply((BigDecimal) value);
      case STRING:
        if (value instanceof Utf8) {
          Utf8 utf8 = (Utf8) value;
          return UTF8String.fromBytes(utf8.getBytes(), 0, utf8.getByteLength());
        }
        return UTF8String.fromString(value.toString());
      case FIXED:
        if (value instanceof byte[]) {
          return value;
        } else if (value instanceof GenericData.Fixed) {
          return ((GenericData.Fixed) value).bytes();
        }
        return ByteBuffers.toByteArray((ByteBuffer) value);
      case BINARY:
        return ByteBuffers.toByteArray((ByteBuffer) value);
      case STRUCT:
        StructType structType = (StructType) type;

        if (structType.fields().isEmpty()) {
          return new GenericInternalRow();
        }

        List<NestedField> fields = structType.fields();
        Object[] values = new Object[fields.size()];
        StructLike struct = (StructLike) value;

        for (int index = 0; index < fields.size(); index++) {
          NestedField field = fields.get(index);
          Type fieldType = field.type();
          values[index] =
              convertConstant(fieldType, struct.get(index, fieldType.typeId().javaClass()));
        }
        return new GenericInternalRow(values);
      default:
    }
    return value;
  }
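
  /**
   * A {@link DeleteFilter} over Spark {@link InternalRow}s that wraps each row as a
   * {@link StructLike} so delete files can be applied, and flags deleted rows by setting the
   * is-deleted metadata column and incrementing the reader's delete counter.
   */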
  protected class SparkDeleteFilter extends DeleteFilter<InternalRow> {
    private final InternalRowWrapper asStructLike;

    SparkDeleteFilter(String filePath, List<DeleteFile> deletes, DeleteCounter counter) {
      super(filePath, deletes, tableSchema, expectedSchema, counter);
      this.asStructLike =
          new InternalRowWrapper(
              SparkSchemaUtil.convert(requiredSchema()), requiredSchema().asStruct());
    }

    @Override
    protected StructLike asStructLike(InternalRow row) {
      return asStructLike.wrap(row);
    }

    @Override
    protected InputFile getInputFile(String location) {
      return BaseReader.this.getInputFile(location);
    }

    @Override
    protected void markRowDeleted(InternalRow row) {
      if (!row.getBoolean(columnIsDeletedPosition())) {
        row.setBoolean(columnIsDeletedPosition(), true);
        counter().increment();
      }
    }

    @Override
    protected DeleteLoader newDeleteLoader() {
      return new CachingDeleteLoader(this::loadInputFile);
    }
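
    /**
     * A delete loader that caches loaded delete content in the shared {@link SparkExecutorCache},
     * when the cache is available and the entry is smaller than its maximum entry size.
     */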
    private class CachingDeleteLoader extends BaseDeleteLoader {
      private final SparkExecutorCache cache;

      CachingDeleteLoader(Function<DeleteFile, InputFile> loadInputFile) {
        super(loadInputFile);
        this.cache = SparkExecutorCache.getOrCreate();
      }

      @Override
      protected boolean canCache(long size) {
        return cache != null && size < cache.maxEntrySize();
      }

      @Override
      protected <V> V getOrLoad(String key, Supplier<V> valueSupplier, long valueSize) {
        return cache.getOrLoad(table().name(), key, valueSupplier, valueSize);
      }
    }
  }
}