/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg;

import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.ClosingIterator;
import org.apache.iceberg.io.FileIO;
import org.apache.iceberg.metrics.MetricsReporter;
import org.apache.iceberg.relocated.com.google.common.base.Joiner;
import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.spark.JobGroupInfo;
import org.apache.iceberg.spark.JobGroupUtils;
import org.apache.iceberg.spark.SparkReadConf;
import org.apache.iceberg.spark.actions.ManifestFileBean;
import org.apache.iceberg.spark.source.SerializableTableWithSize;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.sql.SparkSession;

/**
 * A batch data scan that can utilize Spark cluster resources for planning.
 *
 * <p>This scan remotely filters manifests, fetching only the relevant data and delete files to the
 * driver. The delete file assignment is done locally after the remote filtering step. Such an
 * approach is beneficial if the remote parallelism is much higher than the number of driver cores.
 *
 * <p>This scan is best suited for queries with selective filters on lower/upper bounds across all
 * partitions, or against poorly clustered metadata. This allows job planning to benefit from highly
 * concurrent remote filtering while not incurring high serialization and data transfer costs. This
 * class is also useful for full table scans over large tables, but the cost of bringing data and
 * delete file details to the driver may become noticeable. Make sure to follow the performance tips
 * below in such cases.
 *
 * <p>Ensure the filtered metadata size doesn't exceed the driver's max result size. For large table
 * scans, consider increasing `spark.driver.maxResultSize` to avoid job failures.
 *
 * <p>Performance tips:
 *
 * <ul>
 *   <li>Enable Kryo serialization (`spark.serializer`)
 *   <li>Increase the number of driver cores (`spark.driver.cores`)
 *   <li>Tune the number of threads used to fetch task results (`spark.resultGetter.threads`)
 * </ul>
 *
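 * <p>For illustration only, a session might be configured along these lines before planning large
 * scans; the size below is a placeholder, not a tuned recommendation:
 *
 * <pre>{@code
 * // assumes no SparkSession exists yet, so these settings take effect at context creation;
 * // spark.driver.cores is usually supplied at submit time in cluster deployments
 * SparkSession spark =
 *     SparkSession.builder()
 *         .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
 *         .config("spark.driver.maxResultSize", "4g") // illustrative size
 *         .getOrCreate();
 * }</pre>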
*/
public class SparkDistributedDataScan extends BaseDistributedDataScan {

  private static final Joiner COMMA = Joiner.on(',');
  private static final String DELETE_PLANNING_JOB_GROUP_ID = "DELETE-PLANNING";
  private static final String DATA_PLANNING_JOB_GROUP_ID = "DATA-PLANNING";

  private final SparkSession spark;
  private final JavaSparkContext sparkContext;
  private final SparkReadConf readConf;

  private Broadcast<Table>