org.projectnessie.versioned.persist.adapter.ReferencesUtil Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of nessie-versioned-persist-adapter Show documentation
There is a newer version: 0.59.0
Show newest version
/*
 * Copyright (C) 2022 Dremio
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.projectnessie.versioned.persist.adapter;

import static org.projectnessie.versioned.persist.adapter.spi.AbstractDatabaseAdapter.NO_ANCESTOR;

import com.google.common.annotations.Beta;
import com.google.protobuf.ByteString;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.function.Consumer;
import java.util.stream.Stream;
import org.agrona.collections.Hashing;
import org.agrona.collections.Object2IntHashMap;
import org.agrona.collections.Object2ObjectHashMap;
import org.agrona.collections.ObjectHashSet;
import org.projectnessie.versioned.GetNamedRefsParams;
import org.projectnessie.versioned.Hash;
import org.projectnessie.versioned.NamedRef;
import org.projectnessie.versioned.ReferenceInfo;
import org.projectnessie.versioned.ReferenceNotFoundException;

@Beta
public final class ReferencesUtil {
  /** Database adapter that every scan and commit-log read of this utility goes through. */
  private final DatabaseAdapter databaseAdapter;

  /**
   * Not public; instances are obtained via {@link #forDatabaseAdapter(DatabaseAdapter)}.
   *
   * @param databaseAdapter adapter to operate on, stored as-is (no null check is performed)
   */
  private ReferencesUtil(DatabaseAdapter databaseAdapter) {
    this.databaseAdapter = databaseAdapter;
  }

  /**
   * Static factory for {@link ReferencesUtil}.
   *
   * @param databaseAdapter adapter the returned utility operates on
   * @return a new utility instance bound to {@code databaseAdapter}
   */
  public static ReferencesUtil forDatabaseAdapter(DatabaseAdapter databaseAdapter) {
    ReferencesUtil util = new ReferencesUtil(databaseAdapter);
    return util;
  }

  /**
   * Creates an empty, mutable map backed by Agrona's {@link Object2ObjectHashMap}.
   *
   * <p>The map is created with an initial capacity of 16 and {@link Hashing#DEFAULT_LOAD_FACTOR}.
   * The last constructor argument is passed as {@code false} (Agrona's "avoid allocation" flag;
   * see the Agrona Javadoc for its exact effect).
   *
   * @param <K> key type
   * @param <V> value type
   * @return a new, empty map
   */
  private static <K, V> Map<K, V> newOpenAddressingHashMap() {
    return new Object2ObjectHashMap<>(16, Hashing.DEFAULT_LOAD_FACTOR, false);
  }

  /**
   * Creates an empty, mutable set backed by Agrona's {@link ObjectHashSet}.
   *
   * <p>Uses {@link ObjectHashSet#DEFAULT_INITIAL_CAPACITY} and {@link
   * Hashing#DEFAULT_LOAD_FACTOR}. The last constructor argument is passed as {@code false}
   * (Agrona's "avoid allocation" flag; see the Agrona Javadoc for its exact effect).
   *
   * @param <T> element type
   * @return a new, empty set
   */
  private static <T> Set<T> newOpenAddressingHashSet() {
    return new ObjectHashSet<>(
        ObjectHashSet.DEFAULT_INITIAL_CAPACITY, Hashing.DEFAULT_LOAD_FACTOR, false);
  }

  /**
   * Creates a mutable copy of {@code source} backed by Agrona's {@link ObjectHashSet}.
   *
   * <p>The copy is created with {@code source.size()} as the requested capacity and {@link
   * Hashing#DEFAULT_LOAD_FACTOR}. Later changes to the copy do not affect {@code source}.
   *
   * @param <T> element type
   * @param source elements to copy, must not be {@code null}
   * @return a new set containing all elements of {@code source}
   */
  private static <T> Set<T> newOpenAddressingHashSet(Set<T> source) {
    ObjectHashSet<T> copy =
        new ObjectHashSet<>(source.size(), Hashing.DEFAULT_LOAD_FACTOR, false);
    if (source instanceof ObjectHashSet) {
      // The cast selects the addAll overload that takes an ObjectHashSet instead of the generic
      // Collection-based one.
      copy.addAll((ObjectHashSet<T>) source);
    } else {
      copy.addAll(source);
    }
    return copy;
  }

  /**
   * Incrementally identifies HEADs and fork points from commits that are fed in arbitrary order.
   *
   * <p>Feed every commit via {@link #handleCommit(CommitLogEntry)} or {@link #handleCommit(Hash,
   * Hash)} and retrieve the result via {@link #finish()}.
   */
  public static class IdentifyHeadsAndForkPoints {
    // Contains every hash seen so far, either as a commit-ID or as a parent-ID. The value is a
    // bit set built from MASK_COMMIT_SEEN and MASK_PARENT_SEEN; hashes that were never seen
    // yield 0 (the map's configured "missing value").
    private final Object2IntHashMap<Hash> commits;
    // Commit-IDs that have (so far) not been seen as the parent of another commit.
    private final Set<Hash> heads;
    // Commit-IDs that have been seen as the parent of more than one commit.
    private final Set<Hash> forkPoints;
    // Passed through unchanged to the result produced by finish().
    private final long scanStartedAtInMicros;

    /** Bit set in {@link #commits} once a hash has been handled as a commit-ID. */
    private static final int MASK_COMMIT_SEEN = 1;

    /** Bit set in {@link #commits} once a hash has been handled as a parent-ID. */
    private static final int MASK_PARENT_SEEN = 2;

    /**
     * Creates a new, empty identification state.
     *
     * @param expectedCommitCount expected number of commits that will be handled, used to size the
     *     internal map (twice this value is requested as the initial capacity)
     * @param scanStartedAtInMicros timestamp at which the scan started, handed over to the result
     *     of {@link #finish()}
     */
    public IdentifyHeadsAndForkPoints(int expectedCommitCount, long scanStartedAtInMicros) {
      // Using open-addressing implementation here, because it's much more space-efficient than
      // java.util.HashSet.
      this.commits =
          new Object2IntHashMap<>(expectedCommitCount * 2, Hashing.DEFAULT_LOAD_FACTOR, 0, true);
      this.heads = newOpenAddressingHashSet();
      this.forkPoints = newOpenAddressingHashSet();
      this.scanStartedAtInMicros = scanStartedAtInMicros;
    }

    /**
     * Handles a commit-log entry, using the first element of {@link CommitLogEntry#getParents()}
     * as the parent.
     *
     * @param entry commit-log entry to handle; its parents list must not be empty
     * @return see {@link #handleCommit(Hash, Hash)}
     */
    public boolean handleCommit(CommitLogEntry entry) {
      return handleCommit(entry.getHash(), entry.getParents().get(0));
    }

    /**
     * Records one commit and its direct parent.
     *
     * @param commitId ID of the commit
     * @param parent ID of the commit's parent, or {@code NO_ANCESTOR} if it has none
     * @return {@code true} if {@code commitId} was handled for the first time, {@code false} if it
     *     has already been handled as a commit before (its parent is not processed again)
     */
    public boolean handleCommit(Hash commitId, Hash parent) {

      int cv = commits.getValue(commitId);
      boolean commitNew = (cv & MASK_COMMIT_SEEN) == 0;
      if (commitNew) {
        commits.put(commitId, cv | MASK_COMMIT_SEEN);
      }
      boolean commitNotSeenAsParent = (cv & MASK_PARENT_SEEN) == 0;
      if (commitNotSeenAsParent) {
        // If the commit-ID has not been seen as a parent, it must be a HEAD
        heads.add(commitId);
      }

      // Only process the parent-ID when the commit has not been seen before.
      if (!commitNew) {
        return false;
      }

      // Do not handle 'no ancestor' as a "legit parent".
      if (NO_ANCESTOR.equals(parent)) {
        return true;
      }

      int pv = commits.getValue(parent);
      boolean parentNew = (pv & MASK_PARENT_SEEN) == 0;

      if (!parentNew) {
        // If "parent" has already been seen, then it must be a fork point.
        forkPoints.add(parent);
      } else {
        commits.put(parent, pv | MASK_PARENT_SEEN);
        // Commits in "parents" that are also contained in "heads" cannot be HEADs.
        // This can happen because the commits are scanned in "random order".
        heads.remove(parent);
      }

      return true;
    }

    /**
     * Builds the result from the HEADs and fork points collected so far.
     *
     * @return the identified HEADs and fork points together with the scan-start timestamp given
     *     to the constructor
     */
    public HeadsAndForkPoints finish() {
      return HeadsAndForkPoints.of(heads, forkPoints, scanStartedAtInMicros);
    }
  }

  /**
   * Identifies all heads and fork-points.
   *
   * <ul>
   *   <li>"Heads" are commits that are not referenced by other commits.
   *   <li>"Fork points" are commits that are the parent of more than one other commit. Knowing
   *       these commits can help to optimize the traversal of commit logs of multiple heads.
   * </ul>
   *
   * @param expectedCommitCount it is recommended to tell the implementation the total number of
   *     commits in the Nessie repository
   * @param commitHandler called for every commit while scanning all commits
   * @return the identified heads and fork points, including the timestamp at which the scan
   *     started
   */
  public HeadsAndForkPoints identifyAllHeadsAndForkPoints(
      int expectedCommitCount, Consumer<CommitLogEntry> commitHandler) {

    // Need to remember the time when the identification started, so that a follow-up
    // identifyReferencedAndUnreferencedHeads() knows when it can stop scanning a named-reference's
    // commit-log. identifyReferencedAndUnreferencedHeads() has to read up to the first commit
    // _before_ this timestamp to not return commit-IDs as "unreferenced".
    //
    // Note: keep in mind, that scanAllCommitLogEntries() returns all commits in a
    // non-deterministic order. Example: if (at least) two commits are added to a branch while this
    // function is running, the original HEAD of that branch could otherwise be returned as
    // "unreferenced".
    long scanStartedAtInMicros = databaseAdapter.getConfig().currentTimeInMicros();

    IdentifyHeadsAndForkPoints identify =
        new IdentifyHeadsAndForkPoints(expectedCommitCount, scanStartedAtInMicros);

    // scanAllCommitLogEntries() returns all commits in no specific order, parents may be scanned
    // before or after their children.
    try (Stream<CommitLogEntry> scan = databaseAdapter.scanAllCommitLogEntries()) {
      // The element type must be known here: handleCommit is overloaded and the method reference
      // has to resolve to the CommitLogEntry variant.
      scan.peek(commitHandler).forEach(identify::handleCommit);
    }

    return identify.finish();
  }

  /**
   * Identifies unreferenced heads and heads that are part of a named reference.
   *
   * Requires the output of {@link #identifyAllHeadsAndForkPoints(int, Consumer)}.
   */
  public ReferencedAndUnreferencedHeads identifyReferencedAndUnreferencedHeads(
      HeadsAndForkPoints headsAndForkPoints) throws ReferenceNotFoundException {
    Map> referenced = newOpenAddressingHashMap();
    Set heads = headsAndForkPoints.getHeads();
    Set unreferenced = newOpenAddressingHashSet(heads);

    long stopAtCommitTimeMicros =
        headsAndForkPoints.getScanStartedAtInMicros()
            - databaseAdapter.getConfig().getAssumedWallClockDriftMicros();

    try (Stream> namedRefs =
        databaseAdapter.namedRefs(GetNamedRefsParams.DEFAULT)) {
      namedRefs.forEach(
          refInfo -> {
            try (Stream logs = databaseAdapter.commitLog(refInfo.getHash())) {
              if (!referenced.containsKey(refInfo.getHash())) {
                // Only need to traverse the commit log from the same commit-ID once.
                for (Iterator logIter = logs.iterator(); logIter.hasNext(); ) {
                  CommitLogEntry entry = logIter.next();

                  Hash commitId = entry.getHash();

                  if (referenced.containsKey(commitId)) {
                    // Already saw this commit-ID, can break
                    break;
                  }

                  if (heads.contains(commitId)) {
                    unreferenced.remove(entry.getHash());
                  }

                  if (entry.getCreatedTime() < stopAtCommitTimeMicros) {
                    // Must scan up to the commit created right before
                    // identifyAllHeadsAndForkPoints() started to not accidentally return
                    // commits in 'unreferencedHeads' that were the HEAD of a commit that happened
                    // after identifyAllHeadsAndForkPoints() started.
                    break;
                  }
                }
              }

              // Add the named reference to the reachable HEADs.
              referenced
                  .computeIfAbsent(refInfo.getHash(), x -> newOpenAddressingHashSet())
                  .add(refInfo.getNamedRef());
            } catch (ReferenceNotFoundException e) {
              throw new RuntimeException(e);
            }
          });
    }

    return ReferencedAndUnreferencedHeads.of(referenced, unreferenced);
  }
}