All downloads are free. The search and download functionality uses the official Maven repository.

org.projectnessie.versioned.tiered.gc.IdProducer Maven / Gradle / Ivy

There is a newer version: 0.9.2
Show newest version
/*
 * Copyright (C) 2020 Dremio
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.projectnessie.versioned.tiered.gc;

import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.explode;

import java.io.Serializable;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.function.Supplier;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.projectnessie.versioned.gc.BinaryBloomFilter;
import org.projectnessie.versioned.store.Store;
import org.projectnessie.versioned.store.Store.Acceptor;
import org.projectnessie.versioned.store.ValueType;
import org.projectnessie.versioned.tiered.BaseValue;
import org.sparkproject.guava.collect.Iterators;

/**
 * Convert an item to valid version of it's children.
 *
 * 

Given a particular item and a bloomfilter listing valid ids, determine if the current item is * valid. If the current item is valid, generate all the referenced children ids. */ final class IdProducer { /** * Given a bloom filter and list of items that can produce addition Ids, get the next bloom * filter. */ public static > BinaryBloomFilter getNextBloomFilter( SparkSession spark, BinaryBloomFilter idFilter, ValueType valueType, Supplier store, long targetCount, Function, IdCarrier> converter) { Predicate predicate = t -> idFilter.mightContain(t.getId().getId()); Dataset carriers = IdCarrier.asDataset(valueType, store, converter, Optional.of(predicate), spark); return BinaryBloomFilter.aggregate( carriers.withColumn("id", explode(col("children"))), "id.id"); } /** * Given a bloom filter and list of items that can produce addition Ids, get the next bloom * filter. */ public static > Dataset getKeys( SparkSession spark, ValueType valueType, Supplier store, Function, IdCarrier> converter) { Dataset carriers = IdCarrier.asDataset(valueType, store, converter, Optional.empty(), spark); return carriers.flatMap(new KeyPairFlatMap(), Encoders.bean(IdKeyPair.class)).distinct(); } public static class KeyPairFlatMap implements FlatMapFunction { @Override public Iterator call(IdCarrier idCarrier) throws Exception { if (idCarrier.getChildKeys() == null || idCarrier.getChildKeys().isEmpty()) { return Iterators.emptyIterator(); } return idCarrier.getChildKeys().entrySet().stream().map(IdKeyPair::of).iterator(); } } public static class IdKeyPair implements Serializable { private IdFrame id; private List key; public IdKeyPair() {} public IdKeyPair(IdFrame id, List key) { this.id = id; this.key = key; } public static IdKeyPair of(Map.Entry> kv) { return new IdKeyPair(kv.getKey(), kv.getValue()); } public IdFrame getId() { return id; } public void setId(IdFrame id) { this.id = id; } public List getKey() { return key; } public void setKey(List key) { this.key = key; } @Override public 
boolean equals(Object o) { if (this == o) { return true; } if (o == null || getClass() != o.getClass()) { return false; } IdKeyPair idKeyPair = (IdKeyPair) o; return id.equals(idKeyPair.id); } @Override public int hashCode() { return Objects.hash(id); } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy