org.apache.iceberg.spark.SparkDataFile Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of iceberg-spark-3.3_2.13 Show documentation
Show all versions of iceberg-spark-3.3_2.13 Show documentation
A table format for huge analytic datasets
The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.spark;
import java.nio.ByteBuffer;
import java.util.List;
import java.util.Map;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.StructLike;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.Types;
import org.apache.iceberg.util.StructProjection;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.types.StructType;
public class SparkDataFile implements DataFile {
private final int filePathPosition;
private final int fileFormatPosition;
private final int partitionPosition;
private final int recordCountPosition;
private final int fileSizeInBytesPosition;
private final int columnSizesPosition;
private final int valueCountsPosition;
private final int nullValueCountsPosition;
private final int nanValueCountsPosition;
private final int lowerBoundsPosition;
private final int upperBoundsPosition;
private final int keyMetadataPosition;
private final int splitOffsetsPosition;
private final int sortOrderIdPosition;
private final Type lowerBoundsType;
private final Type upperBoundsType;
private final Type keyMetadataType;
private final SparkStructLike wrappedPartition;
private final StructLike partitionProjection;
private Row wrapped;
public SparkDataFile(Types.StructType type, StructType sparkType) {
this(type, null, sparkType);
}
public SparkDataFile(
Types.StructType type, Types.StructType projectedType, StructType sparkType) {
this.lowerBoundsType = type.fieldType("lower_bounds");
this.upperBoundsType = type.fieldType("upper_bounds");
this.keyMetadataType = type.fieldType("key_metadata");
Types.StructType partitionType = type.fieldType("partition").asStructType();
this.wrappedPartition = new SparkStructLike(partitionType);
if (projectedType != null) {
Types.StructType projectedPartitionType = projectedType.fieldType("partition").asStructType();
this.partitionProjection =
StructProjection.create(partitionType, projectedPartitionType).wrap(wrappedPartition);
} else {
this.partitionProjection = wrappedPartition;
}
Map positions = Maps.newHashMap();
type.fields()
.forEach(
field -> {
String fieldName = field.name();
positions.put(fieldName, fieldPosition(fieldName, sparkType));
});
filePathPosition = positions.get("file_path");
fileFormatPosition = positions.get("file_format");
partitionPosition = positions.get("partition");
recordCountPosition = positions.get("record_count");
fileSizeInBytesPosition = positions.get("file_size_in_bytes");
columnSizesPosition = positions.get("column_sizes");
valueCountsPosition = positions.get("value_counts");
nullValueCountsPosition = positions.get("null_value_counts");
nanValueCountsPosition = positions.get("nan_value_counts");
lowerBoundsPosition = positions.get("lower_bounds");
upperBoundsPosition = positions.get("upper_bounds");
keyMetadataPosition = positions.get("key_metadata");
splitOffsetsPosition = positions.get("split_offsets");
sortOrderIdPosition = positions.get("sort_order_id");
}
public SparkDataFile wrap(Row row) {
this.wrapped = row;
if (wrappedPartition.size() > 0) {
this.wrappedPartition.wrap(row.getAs(partitionPosition));
}
return this;
}
@Override
public Long pos() {
return null;
}
@Override
public int specId() {
return -1;
}
@Override
public CharSequence path() {
return wrapped.getAs(filePathPosition);
}
@Override
public FileFormat format() {
return FileFormat.fromString(wrapped.getString(fileFormatPosition));
}
@Override
public StructLike partition() {
return partitionProjection;
}
@Override
public long recordCount() {
return wrapped.getAs(recordCountPosition);
}
@Override
public long fileSizeInBytes() {
return wrapped.getAs(fileSizeInBytesPosition);
}
@Override
public Map columnSizes() {
return wrapped.isNullAt(columnSizesPosition) ? null : wrapped.getJavaMap(columnSizesPosition);
}
@Override
public Map valueCounts() {
return wrapped.isNullAt(valueCountsPosition) ? null : wrapped.getJavaMap(valueCountsPosition);
}
@Override
public Map nullValueCounts() {
return wrapped.isNullAt(nullValueCountsPosition)
? null
: wrapped.getJavaMap(nullValueCountsPosition);
}
@Override
public Map nanValueCounts() {
return wrapped.isNullAt(nanValueCountsPosition)
? null
: wrapped.getJavaMap(nanValueCountsPosition);
}
@Override
public Map lowerBounds() {
Map, ?> lowerBounds =
wrapped.isNullAt(lowerBoundsPosition) ? null : wrapped.getJavaMap(lowerBoundsPosition);
return convert(lowerBoundsType, lowerBounds);
}
@Override
public Map upperBounds() {
Map, ?> upperBounds =
wrapped.isNullAt(upperBoundsPosition) ? null : wrapped.getJavaMap(upperBoundsPosition);
return convert(upperBoundsType, upperBounds);
}
@Override
public ByteBuffer keyMetadata() {
return convert(keyMetadataType, wrapped.get(keyMetadataPosition));
}
@Override
public DataFile copy() {
throw new UnsupportedOperationException("Not implemented: copy");
}
@Override
public DataFile copyWithoutStats() {
throw new UnsupportedOperationException("Not implemented: copyWithoutStats");
}
@Override
public List splitOffsets() {
return wrapped.isNullAt(splitOffsetsPosition) ? null : wrapped.getList(splitOffsetsPosition);
}
@Override
public Integer sortOrderId() {
return wrapped.getAs(sortOrderIdPosition);
}
private int fieldPosition(String name, StructType sparkType) {
try {
return sparkType.fieldIndex(name);
} catch (IllegalArgumentException e) {
// the partition field is absent for unpartitioned tables
if (name.equals("partition") && wrappedPartition.size() == 0) {
return -1;
}
throw e;
}
}
@SuppressWarnings("unchecked")
private T convert(Type valueType, Object value) {
return (T) SparkValueConverter.convert(valueType, value);
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy