// Archived source listing: org.apache.iceberg.spark.SparkExecutorCache
// (module iceberg-spark-3.5_2.13)
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.spark;
import com.github.benmanes.caffeine.cache.Cache;
import com.github.benmanes.caffeine.cache.Caffeine;
import java.time.Duration;
import java.util.List;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* An executor cache for reducing the computation and IO overhead in tasks.
*
 * <p>The cache is configured and controlled through Spark SQL properties. It supports both limits
* on the total cache size and maximum size for individual entries. Additionally, it implements
* automatic eviction of entries after a specified duration of inactivity. The cache will respect
* the SQL configuration valid at the time of initialization. All subsequent changes to the
* configuration will have no effect.
*
 * <p>The cache is accessed and populated via {@link #getOrLoad(String, String, Supplier, long)}. If
* the value is not present in the cache, it is computed using the provided supplier and stored in
* the cache, subject to the defined size constraints. When a key is added, it must be associated
* with a particular group ID. Once the group is no longer needed, it is recommended to explicitly
* invalidate its state by calling {@link #invalidate(String)} instead of relying on automatic
* eviction.
*
 * <p>Note that this class employs the singleton pattern to ensure only one cache exists per JVM.
*/
public class SparkExecutorCache {

  private static final Logger LOG = LoggerFactory.getLogger(SparkExecutorCache.class);

  // Singleton instance; written once under the class lock, read through the volatile field.
  private static volatile SparkExecutorCache instance = null;

  private final Duration timeout;
  private final long maxEntrySize;
  private final long maxTotalSize;

  // Lazily initialized Caffeine cache; volatile for the double-checked locking in state().
  private volatile Cache<String, CacheValue> state;

  private SparkExecutorCache(Conf conf) {
    this.timeout = conf.timeout();
    this.maxEntrySize = conf.maxEntrySize();
    this.maxTotalSize = conf.maxTotalSize();
  }

  /**
   * Returns the cache if created or creates and returns it.
   *
   * <p>Note this method returns null if caching is disabled.
   */
  public static SparkExecutorCache getOrCreate() {
    if (instance == null) {
      Conf conf = new Conf();
      if (conf.cacheEnabled()) {
        synchronized (SparkExecutorCache.class) {
          if (instance == null) {
            SparkExecutorCache.instance = new SparkExecutorCache(conf);
          }
        }
      }
    }

    return instance;
  }

  /** Returns the cache if already created or null otherwise. */
  public static SparkExecutorCache get() {
    return instance;
  }

  /** Returns the max entry size in bytes that will be considered for caching. */
  public long maxEntrySize() {
    return maxEntrySize;
  }

  /**
   * Gets the cached value for the key or populates the cache with a new mapping.
   *
   * @param group a group ID
   * @param key a cache key
   * @param valueSupplier a supplier to compute the value
   * @param valueSize an estimated memory size of the value in bytes
   * @param <V> the type of the cached value
   * @return the cached or computed value
   */
  public <V> V getOrLoad(String group, String key, Supplier<V> valueSupplier, long valueSize) {
    if (valueSize > maxEntrySize) {
      // the value is too big to ever fit; compute it without touching the cache
      LOG.debug("{} exceeds max entry size: {} > {}", key, valueSize, maxEntrySize);
      return valueSupplier.get();
    }

    // keys are namespaced by group so invalidate(group) can find them later
    String internalKey = group + "_" + key;
    CacheValue value = state().get(internalKey, loadFunc(valueSupplier, valueSize));
    Preconditions.checkNotNull(value, "Loaded value must not be null");
    return value.get();
  }

  private <V> Function<String, CacheValue> loadFunc(Supplier<V> valueSupplier, long valueSize) {
    return key -> {
      long start = System.currentTimeMillis();
      V value = valueSupplier.get();
      long end = System.currentTimeMillis();
      LOG.debug("Loaded {} with size {} in {} ms", key, valueSize, (end - start));
      return new CacheValue(value, valueSize);
    };
  }

  /**
   * Invalidates all keys associated with the given group ID.
   *
   * @param group a group ID
   */
  public void invalidate(String group) {
    if (state != null) {
      List<String> internalKeys = findInternalKeys(group);
      LOG.info("Invalidating {} keys associated with {}", internalKeys.size(), group);
      internalKeys.forEach(internalKey -> state.invalidate(internalKey));
      LOG.info("Current cache stats {}", state.stats());
    }
  }

  private List<String> findInternalKeys(String group) {
    // match the "<group>_<key>" format used in getOrLoad; including the separator
    // prevents invalidating another group that merely shares a name prefix
    String prefix = group + "_";
    return state.asMap().keySet().stream()
        .filter(internalKey -> internalKey.startsWith(prefix))
        .collect(Collectors.toList());
  }

  private Cache<String, CacheValue> state() {
    if (state == null) {
      synchronized (this) {
        if (state == null) {
          LOG.info("Initializing cache state");
          this.state = initState();
        }
      }
    }

    return state;
  }

  private Cache<String, CacheValue> initState() {
    return Caffeine.newBuilder()
        .expireAfterAccess(timeout)
        .maximumWeight(maxTotalSize)
        .weigher((String key, CacheValue value) -> value.weight())
        .recordStats()
        .removalListener((key, value, cause) -> LOG.debug("Evicted {} ({})", key, cause))
        .build();
  }

  /** An immutable cache entry pairing an opaque value with its estimated size in bytes. */
  @VisibleForTesting
  static class CacheValue {
    private final Object value;
    private final long size;

    CacheValue(Object value, long size) {
      this.value = value;
      this.size = size;
    }

    @SuppressWarnings("unchecked")
    public <V> V get() {
      return (V) value;
    }

    /** Returns the entry weight, clamped to Integer.MAX_VALUE as Caffeine weighers return int. */
    public int weight() {
      return (int) Math.min(size, Integer.MAX_VALUE);
    }
  }

  /** Reads the cache configuration from the Spark SQL session properties. */
  @VisibleForTesting
  static class Conf {
    private final SparkConfParser confParser = new SparkConfParser();

    public boolean cacheEnabled() {
      return confParser
          .booleanConf()
          .sessionConf(SparkSQLProperties.EXECUTOR_CACHE_ENABLED)
          .defaultValue(SparkSQLProperties.EXECUTOR_CACHE_ENABLED_DEFAULT)
          .parse();
    }

    public Duration timeout() {
      return confParser
          .durationConf()
          .sessionConf(SparkSQLProperties.EXECUTOR_CACHE_TIMEOUT)
          .defaultValue(SparkSQLProperties.EXECUTOR_CACHE_TIMEOUT_DEFAULT)
          .parse();
    }

    public long maxEntrySize() {
      return confParser
          .longConf()
          .sessionConf(SparkSQLProperties.EXECUTOR_CACHE_MAX_ENTRY_SIZE)
          .defaultValue(SparkSQLProperties.EXECUTOR_CACHE_MAX_ENTRY_SIZE_DEFAULT)
          .parse();
    }

    public long maxTotalSize() {
      return confParser
          .longConf()
          .sessionConf(SparkSQLProperties.EXECUTOR_CACHE_MAX_TOTAL_SIZE)
          .defaultValue(SparkSQLProperties.EXECUTOR_CACHE_MAX_TOTAL_SIZE_DEFAULT)
          .parse();
    }
  }
}