org.apache.iceberg.spark.SparkCompressionUtil Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of iceberg-spark-3.5_2.13 Show documentation
Show all versions of iceberg-spark-3.5_2.13 Show documentation
A table format for huge analytic datasets
The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.spark;
import java.util.Locale;
import java.util.Map;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.util.Pair;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.SparkSession;
class SparkCompressionUtil {
private static final String LZ4 = "lz4";
private static final String ZSTD = "zstd";
private static final String GZIP = "gzip";
private static final String ZLIB = "zlib";
private static final String SNAPPY = "snappy";
private static final String NONE = "none";
// an internal Spark config that controls whether shuffle data is compressed
private static final String SHUFFLE_COMPRESSION_ENABLED = "spark.shuffle.compress";
private static final boolean SHUFFLE_COMPRESSION_ENABLED_DEFAULT = true;
// an internal Spark config that controls what compression codec is used
private static final String SPARK_COMPRESSION_CODEC = "spark.io.compression.codec";
private static final String SPARK_COMPRESSION_CODEC_DEFAULT = "lz4";
private static final double DEFAULT_COLUMNAR_COMPRESSION = 2;
private static final Map, Double> COLUMNAR_COMPRESSIONS =
initColumnarCompressions();
private static final double DEFAULT_ROW_BASED_COMPRESSION = 1;
private static final Map, Double> ROW_BASED_COMPRESSIONS =
initRowBasedCompressions();
private SparkCompressionUtil() {}
/**
* Estimates how much the data in shuffle map files will compress once it is written to disk using
* a particular file format and codec.
*/
public static double shuffleCompressionRatio(
SparkSession spark, FileFormat outputFileFormat, String outputCodec) {
if (outputFileFormat == FileFormat.ORC || outputFileFormat == FileFormat.PARQUET) {
return columnarCompression(shuffleCodec(spark), outputCodec);
} else if (outputFileFormat == FileFormat.AVRO) {
return rowBasedCompression(shuffleCodec(spark), outputCodec);
} else {
return 1.0;
}
}
private static String shuffleCodec(SparkSession spark) {
SparkConf sparkConf = spark.sparkContext().conf();
return shuffleCompressionEnabled(sparkConf) ? sparkCodec(sparkConf) : NONE;
}
private static boolean shuffleCompressionEnabled(SparkConf sparkConf) {
return sparkConf.getBoolean(SHUFFLE_COMPRESSION_ENABLED, SHUFFLE_COMPRESSION_ENABLED_DEFAULT);
}
private static String sparkCodec(SparkConf sparkConf) {
return sparkConf.get(SPARK_COMPRESSION_CODEC, SPARK_COMPRESSION_CODEC_DEFAULT);
}
private static double columnarCompression(String shuffleCodec, String outputCodec) {
Pair key = Pair.of(normalize(shuffleCodec), normalize(outputCodec));
return COLUMNAR_COMPRESSIONS.getOrDefault(key, DEFAULT_COLUMNAR_COMPRESSION);
}
private static double rowBasedCompression(String shuffleCodec, String outputCodec) {
Pair key = Pair.of(normalize(shuffleCodec), normalize(outputCodec));
return ROW_BASED_COMPRESSIONS.getOrDefault(key, DEFAULT_ROW_BASED_COMPRESSION);
}
private static String normalize(String value) {
return value != null ? value.toLowerCase(Locale.ROOT) : null;
}
private static Map, Double> initColumnarCompressions() {
Map, Double> compressions = Maps.newHashMap();
compressions.put(Pair.of(NONE, ZSTD), 4.0);
compressions.put(Pair.of(NONE, GZIP), 4.0);
compressions.put(Pair.of(NONE, ZLIB), 4.0);
compressions.put(Pair.of(NONE, SNAPPY), 3.0);
compressions.put(Pair.of(NONE, LZ4), 3.0);
compressions.put(Pair.of(ZSTD, ZSTD), 2.0);
compressions.put(Pair.of(ZSTD, GZIP), 2.0);
compressions.put(Pair.of(ZSTD, ZLIB), 2.0);
compressions.put(Pair.of(ZSTD, SNAPPY), 1.5);
compressions.put(Pair.of(ZSTD, LZ4), 1.5);
compressions.put(Pair.of(SNAPPY, ZSTD), 3.0);
compressions.put(Pair.of(SNAPPY, GZIP), 3.0);
compressions.put(Pair.of(SNAPPY, ZLIB), 3.0);
compressions.put(Pair.of(SNAPPY, SNAPPY), 2.0);
compressions.put(Pair.of(SNAPPY, LZ4), 2.);
compressions.put(Pair.of(LZ4, ZSTD), 3.0);
compressions.put(Pair.of(LZ4, GZIP), 3.0);
compressions.put(Pair.of(LZ4, ZLIB), 3.0);
compressions.put(Pair.of(LZ4, SNAPPY), 2.0);
compressions.put(Pair.of(LZ4, LZ4), 2.0);
return compressions;
}
private static Map, Double> initRowBasedCompressions() {
Map, Double> compressions = Maps.newHashMap();
compressions.put(Pair.of(NONE, ZSTD), 2.0);
compressions.put(Pair.of(NONE, GZIP), 2.0);
compressions.put(Pair.of(NONE, ZLIB), 2.0);
compressions.put(Pair.of(ZSTD, SNAPPY), 0.5);
compressions.put(Pair.of(ZSTD, LZ4), 0.5);
compressions.put(Pair.of(SNAPPY, ZSTD), 1.5);
compressions.put(Pair.of(SNAPPY, GZIP), 1.5);
compressions.put(Pair.of(SNAPPY, ZLIB), 1.5);
compressions.put(Pair.of(LZ4, ZSTD), 1.5);
compressions.put(Pair.of(LZ4, GZIP), 1.5);
compressions.put(Pair.of(LZ4, ZLIB), 1.5);
return compressions;
}
}