// org.apache.iceberg.spark.source.BaseDataReader — from the iceberg-spark3 artifact
// ("A table format for huge analytic datasets").
// NOTE(review): the lines below were page-navigation text captured when this file was
// scraped from a Maven repository listing; they are preserved here as a comment.
// Go to download | Show more of this group | Show more artifacts with this name
// Show all versions of iceberg-spark3 | Show documentation
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.spark.source;
import java.io.Closeable;
import java.io.IOException;
import java.math.BigDecimal;
import java.nio.ByteBuffer;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.stream.Stream;
import org.apache.avro.generic.GenericData;
import org.apache.avro.util.Utf8;
import org.apache.iceberg.CombinedScanTask;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.MetadataColumns;
import org.apache.iceberg.Partitioning;
import org.apache.iceberg.Schema;
import org.apache.iceberg.StructLike;
import org.apache.iceberg.Table;
import org.apache.iceberg.encryption.EncryptedFiles;
import org.apache.iceberg.encryption.EncryptedInputFile;
import org.apache.iceberg.io.CloseableIterator;
import org.apache.iceberg.io.InputFile;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.Types.NestedField;
import org.apache.iceberg.types.Types.StructType;
import org.apache.iceberg.util.ByteBuffers;
import org.apache.iceberg.util.PartitionUtil;
import org.apache.spark.rdd.InputFileBlockHolder;
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
import org.apache.spark.sql.types.Decimal;
import org.apache.spark.unsafe.types.UTF8String;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Base class of Spark readers.
 *
 * <p>Iterates over the {@link FileScanTask}s of a {@link CombinedScanTask}, opening each file
 * via the subclass-provided {@link #open(FileScanTask)} and exposing rows through
 * {@link #next()} / {@link #get()}.
 *
 * @param <T> is the Java class returned by this reader whose objects contain one or more rows.
 */
abstract class BaseDataReader<T> implements Closeable {
  private static final Logger LOG = LoggerFactory.getLogger(BaseDataReader.class);

  private final Table table;
  // Remaining file tasks to read; consumed one at a time by next().
  private final Iterator<FileScanTask> tasks;
  // Decrypted input files for this combined task, keyed by file location.
  private final Map<String, InputFile> inputFiles;

  private CloseableIterator<T> currentIterator;
  private T current = null;
  private FileScanTask currentTask = null;

  BaseDataReader(Table table, CombinedScanTask task) {
    this.table = table;
    this.tasks = task.files().iterator();
    // Collect key metadata for every data and delete file referenced by this task,
    // keyed by file location. putIfAbsent-style dedup happens below on the decrypted side.
    Map<String, ByteBuffer> keyMetadata = Maps.newHashMap();
    task.files().stream()
        .flatMap(fileScanTask -> Stream.concat(Stream.of(fileScanTask.file()), fileScanTask.deletes().stream()))
        .forEach(file -> keyMetadata.put(file.path().toString(), file.keyMetadata()));
    Stream<EncryptedInputFile> encrypted = keyMetadata.entrySet().stream()
        .map(entry -> EncryptedFiles.encryptedInput(table.io().newInputFile(entry.getKey()), entry.getValue()));

    // decrypt with the batch call to avoid multiple RPCs to a key server, if possible
    Iterable<InputFile> decryptedFiles = table.encryption().decrypt(encrypted::iterator);

    Map<String, InputFile> files = Maps.newHashMapWithExpectedSize(task.files().size());
    decryptedFiles.forEach(decrypted -> files.putIfAbsent(decrypted.location(), decrypted));
    this.inputFiles = ImmutableMap.copyOf(files);

    this.currentIterator = CloseableIterator.empty();
  }

  /**
   * Advances to the next row, opening subsequent file tasks as each iterator is exhausted.
   *
   * @return true if a row is available via {@link #get()}, false when all tasks are consumed
   * @throws IOException if opening, reading, or closing a file iterator fails
   */
  public boolean next() throws IOException {
    try {
      while (true) {
        if (currentIterator.hasNext()) {
          this.current = currentIterator.next();
          return true;
        } else if (tasks.hasNext()) {
          // close the exhausted iterator before opening the next task's file
          this.currentIterator.close();
          this.currentTask = tasks.next();
          this.currentIterator = open(currentTask);
        } else {
          this.currentIterator.close();
          return false;
        }
      }
    } catch (IOException | RuntimeException e) {
      // data tasks have no backing file to report
      if (currentTask != null && !currentTask.isDataTask()) {
        LOG.error("Error reading file: {}", getInputFile(currentTask).location(), e);
      }
      throw e;
    }
  }

  /** Returns the row produced by the last successful {@link #next()} call. */
  public T get() {
    return current;
  }

  /** Opens an iterator over the rows of the given file task; implemented by format-specific subclasses. */
  abstract CloseableIterator<T> open(FileScanTask task);

  @Override
  public void close() throws IOException {
    InputFileBlockHolder.unset();

    // close the current iterator
    this.currentIterator.close();

    // exhaust the task iterator
    while (tasks.hasNext()) {
      tasks.next();
    }
  }

  /** Returns the (decrypted) input file backing the given file task. */
  protected InputFile getInputFile(FileScanTask task) {
    Preconditions.checkArgument(!task.isDataTask(), "Invalid task type");
    return inputFiles.get(task.file().path().toString());
  }

  /** Returns the (decrypted) input file at the given location, or null if not part of this task. */
  protected InputFile getInputFile(String location) {
    return inputFiles.get(location);
  }

  /**
   * Builds the map of constant column values (partition values and metadata columns) for a task,
   * keyed by field id and converted to Spark internal representations.
   */
  protected Map<Integer, ?> constantsMap(FileScanTask task, Schema readSchema) {
    if (readSchema.findField(MetadataColumns.PARTITION_COLUMN_ID) != null) {
      // the _partition metadata column is projected: use the table-wide partition type
      StructType partitionType = Partitioning.partitionType(table);
      return PartitionUtil.constantsMap(task, partitionType, BaseDataReader::convertConstant);
    } else {
      return PartitionUtil.constantsMap(task, BaseDataReader::convertConstant);
    }
  }

  /**
   * Converts an Iceberg constant value to Spark's internal representation for the given type
   * (e.g. BigDecimal to Decimal, CharSequence to UTF8String, StructLike to GenericInternalRow).
   * Values of types without a listed conversion are returned unchanged.
   */
  protected static Object convertConstant(Type type, Object value) {
    if (value == null) {
      return null;
    }

    switch (type.typeId()) {
      case DECIMAL:
        return Decimal.apply((BigDecimal) value);
      case STRING:
        if (value instanceof Utf8) {
          // avoid an intermediate String copy for Avro Utf8 values
          Utf8 utf8 = (Utf8) value;
          return UTF8String.fromBytes(utf8.getBytes(), 0, utf8.getByteLength());
        }
        return UTF8String.fromString(value.toString());
      case FIXED:
        if (value instanceof byte[]) {
          return value;
        } else if (value instanceof GenericData.Fixed) {
          return ((GenericData.Fixed) value).bytes();
        }
        return ByteBuffers.toByteArray((ByteBuffer) value);
      case BINARY:
        return ByteBuffers.toByteArray((ByteBuffer) value);
      case STRUCT:
        StructType structType = (StructType) type;

        if (structType.fields().isEmpty()) {
          return new GenericInternalRow();
        }

        List<NestedField> fields = structType.fields();
        Object[] values = new Object[fields.size()];
        StructLike struct = (StructLike) value;

        for (int index = 0; index < fields.size(); index++) {
          NestedField field = fields.get(index);
          Type fieldType = field.type();
          values[index] = convertConstant(fieldType, struct.get(index, fieldType.typeId().javaClass()));
        }

        return new GenericInternalRow(values);
      default:
    }
    return value;
  }
}
// (scrape residue, preserved) © 2015 - 2025 Weber Informatics LLC | Privacy Policy