All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.projectnessie.versioned.tiered.gc.RefToL2Producer Maven / Gradle / Ivy

There is a newer version: 0.9.2
Show newest version
/*
 * Copyright (C) 2020 Dremio
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.projectnessie.versioned.tiered.gc;

import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.explode;

import java.util.function.Supplier;
import org.apache.spark.sql.AnalysisException;
import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.functions;
import org.apache.spark.sql.types.DataTypes;
import org.projectnessie.versioned.gc.BinaryBloomFilter;
import org.projectnessie.versioned.store.Store;

/**
 * Given a particular ref, generate all the referenced L2s from the ref.
 *
 * 

This loads the ref, figures out the last persisted L1, then recursively iterates through the * L1s parents until we find that L1s are no longer valid. For each valid L1, we output all the L1s. */ class RefToL2Producer { public static BinaryBloomFilter getL2BloomFilter( SparkSession spark, Supplier store, long maxAgeMicros, long slopMicros, long size) throws AnalysisException { final Dataset refs = RefFrame.asDataset(store, spark).select("name", "id"); // expose a table of all l1s. Dataset l1s = L1Frame.asDataset(store, spark); l1s.createOrReplaceTempView("l1"); // create a potential view for l1s to consider. refs.select("id", "name").createOrReplaceTempView("potential"); Dataset referencedL1s = null; // schema is id + children. // iterate over child -> parent relationships until we get to the root of each valid ref. while (true) { // the potential items joined with the available l1s. final Dataset joined = spark.sql( "SELECT potential.name, l1.id, l1.parents, l1.children FROM potential JOIN l1 ON potential.id = l1.id"); joined.createOrReplaceTempView("joined"); if (referencedL1s == null) { // append direct references to the list of valid l1s (the rest of the algorithm below is // focused // on parents of the current reference). referencedL1s = joined.select("id", "children"); } // for each l1, find all of the parents of that l1 and filter them based on the gc policy. final Dataset exploded = spark .sql( "SELECT name, exploded.parents.id AS id, exploded.parents.recurse AS recurse, l1.dt, l1.children FROM " + "(SELECT name, explode(parents) AS parents FROM joined) as exploded " + "JOIN l1 ON exploded.parents.id = l1.id") .filter(gc(maxAgeMicros)); // if we didn't find any more children, terminate the loop. if (exploded.count() == 0) { break; } // append the additional l1s to the existing list of referenced l1s. referencedL1s = referencedL1s.unionAll(exploded.select("id", "children")); // replace the first potential list with this new potential list so we can loop and recurse // Important, the loop variable here is effectively a catalog view name. exploded.filter("recurse = true").select("id", "name").createOrReplaceTempView("potential"); // TODO: put a break if loops too many times. // The maximum possible loop count should be longest commit history divided by // ParentList.MAX_PARENT_LIST_SIZE. } // find any l1 that is within the time slop and thus should always be considered referenced. final long recentL1s = slopMicros; Dataset slopL1s = l1s.filter(String.format("dt > %d", recentL1s)).select("id", "children"); // build a bloomfilter of all referenced l2's by exploding the tree of each l1's children. final Dataset ids = referencedL1s .unionAll(slopL1s) .withColumn("children", explode(col("children"))) .withColumn("id", col("children.id")); return BinaryBloomFilter.aggregate(ids, "id"); } static Column gc(long maxAgeMicros) { return functions .udf(new GcPolicy(maxAgeMicros), DataTypes.BooleanType) .apply(col("name"), col("dt")); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy