All downloads are free. Search and download functionality uses the official Maven repository.

com.google.cloud.hadoop.gcsio.CacheSupplementedGoogleCloudStorage Maven / Gradle / Ivy

/**
 * Copyright 2013 Google Inc. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.cloud.hadoop.gcsio;

import com.google.cloud.hadoop.util.LogUtil;
import com.google.common.base.Preconditions;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.WritableByteChannel;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * CacheSupplementedGoogleCloudStorage adds additional book-keeping to a GoogleCloudStorage instance
 * using a {@code DirectoryListCache} and wraps the create/copy/delete/list methods to provide
 * immediate same-client consistency for "list" operations following a "create/copy/delete". See
 * {@code DirectoryListCache} for details of consistency semantics.
 *
 * <p>Every operation is forwarded to the wrapped delegate. On top of that, this class records
 * created/copied resources in the cache, removes deleted resources from the cache, and merges
 * cached entries which the delegate did not return into the results of the list* methods.
 */
public class CacheSupplementedGoogleCloudStorage
    implements GoogleCloudStorage {
  // Logger.
  private static final LogUtil log = new LogUtil(CacheSupplementedGoogleCloudStorage.class);

  // An actual implementation of GoogleCloudStorage which will be used for the actual logic of
  // GCS operations, while this class adds book-keeping around the delegated calls.
  private final GoogleCloudStorage gcsDelegate;

  // Cache of freshly created Buckets or StorageObjects to be updated on create/copy/delete to
  // supplement "list" calls with GCS resources which may not have appeared in the Cloud list
  // index yet.
  // TODO(user): Add support for perf-boosting use-cases, such as serving getItemInfo directly
  // from cache once we have plumbing in-place to pre-populate metadata on create/copy. Also,
  // consider cases where it's possible to serve list* exclusively from cache as long as cross-
  // client consistency isn't enforced.
  private final DirectoryListCache resourceCache;

  /**
   * Constructs a CacheSupplementedGoogleCloudStorage which should be usable anywhere a
   * GoogleCloudStorage interface is used and which supplements missing listObject/listBucket
   * results from an in-memory cache of known GCS resources which may not have propagated into
   * the eventually-consistent remote "list" index yet.
   *
   * @param gcsDelegate The GoogleCloudStorage to be used for normal API interactions, before
   *     supplementing with in-memory info.
   * @param resourceCache The DirectoryListCache which is updated on create/copy/delete and
   *     consulted to supplement the results of the list* methods.
   * @throws IllegalArgumentException if either argument is null.
   */
  public CacheSupplementedGoogleCloudStorage(
      GoogleCloudStorage gcsDelegate, DirectoryListCache resourceCache) {
    Preconditions.checkArgument(gcsDelegate != null, "gcsDelegate must not be null");
    Preconditions.checkArgument(resourceCache != null, "resourceCache must not be null");

    this.gcsDelegate = gcsDelegate;
    this.resourceCache = resourceCache;
  }

  /**
   * Same as {@link #create(StorageResourceId, CreateObjectOptions)} using
   * {@code CreateObjectOptions.DEFAULT}.
   *
   * @param resourceId The object to create.
   * @return A channel which records {@code resourceId} in the resourceCache when closed.
   * @throws IOException if the delegate fails to create the channel.
   */
  @Override
  public WritableByteChannel create(final StorageResourceId resourceId)
      throws IOException {
    log.debug("create(%s)", resourceId);
    return create(resourceId, CreateObjectOptions.DEFAULT);
  }

  /**
   * Wraps the delegate's returned WritableByteChannel in a helper which will update the
   * resourceCache when close() is called.
   *
   * @param resourceId The object to create.
   * @param options Creation options, passed through to the delegate unchanged.
   * @return A channel forwarding write()/isOpen()/close() to the delegate's channel and
   *     recording {@code resourceId} in the resourceCache after the delegate's close() succeeds.
   * @throws IOException if the delegate fails to create the channel.
   */
  @Override
  public WritableByteChannel create(final StorageResourceId resourceId, CreateObjectOptions options)
      throws IOException {
    log.debug("create(%s, %s)", resourceId, options);

    final WritableByteChannel innerChannel = gcsDelegate.create(resourceId, options);

    // Wrap the delegate's channel in our own channel which simply adds the additional book-keeping
    // hook to close().
    return new WritableByteChannel() {
      // Set once the resourceId has been recorded, so that a repeated close() (which the Channel
      // contract allows and defines as a no-op) cannot re-add an id which has since been removed
      // from the cache by a delete.
      private boolean recordedInCache = false;

      @Override
      public int write(ByteBuffer buffer)
          throws IOException {
        return innerChannel.write(buffer);
      }

      @Override
      public boolean isOpen() {
        return innerChannel.isOpen();
      }

      @Override
      public void close()
          throws IOException {
        // If the delegate's close() throws, nothing is recorded and a later close() may retry.
        innerChannel.close();
        // TODO(user): Make create() somehow wire the StorageObject through to the caller,
        // possibly through an onClose() handler so that we can pre-emptively populate the
        // metadata in the CacheEntry.
        if (!recordedInCache) {
          resourceCache.putResourceId(resourceId);
          recordedInCache = true;
        }
      }
    };
  }

  /**
   * Records the resourceId after delegating.
   *
   * @param resourceId The empty object to create.
   * @throws IOException if the delegate fails; in that case the cache is not updated.
   */
  @Override
  public void createEmptyObject(StorageResourceId resourceId)
      throws IOException {
    log.debug("createEmptyObject(%s)", resourceId);
    gcsDelegate.createEmptyObject(resourceId);
    resourceCache.putResourceId(resourceId);
  }

  /**
   * Records the resourceId after delegating.
   *
   * @param resourceId The empty object to create.
   * @param options Creation options, passed through to the delegate unchanged.
   * @throws IOException if the delegate fails; in that case the cache is not updated.
   */
  @Override
  public void createEmptyObject(StorageResourceId resourceId, CreateObjectOptions options)
      throws IOException {
    log.debug("createEmptyObject(%s, %s)", resourceId, options);
    gcsDelegate.createEmptyObject(resourceId, options);
    resourceCache.putResourceId(resourceId);
  }

  /**
   * Records the resourceIds after delegating.
   *
   * @param resourceIds The empty objects to create.
   * @throws IOException if the delegate fails; in that case none of the ids are recorded.
   */
  @Override
  public void createEmptyObjects(List<StorageResourceId> resourceIds)
      throws IOException {
    log.debug("createEmptyObjects(%s)", resourceIds);
    gcsDelegate.createEmptyObjects(resourceIds);
    for (StorageResourceId resourceId : resourceIds) {
      resourceCache.putResourceId(resourceId);
    }
  }

  /**
   * Records the resourceIds after delegating.
   *
   * @param resourceIds The empty objects to create.
   * @param options Creation options, passed through to the delegate unchanged.
   * @throws IOException if the delegate fails; in that case none of the ids are recorded.
   */
  @Override
  public void createEmptyObjects(List<StorageResourceId> resourceIds, CreateObjectOptions options)
      throws IOException {
    log.debug("createEmptyObjects(%s, %s)", resourceIds, options);
    gcsDelegate.createEmptyObjects(resourceIds, options);
    for (StorageResourceId resourceId : resourceIds) {
      resourceCache.putResourceId(resourceId);
    }
  }

  /**
   * Pure pass-through.
   */
  @Override
  public SeekableReadableByteChannel open(StorageResourceId resourceId)
      throws IOException {
    log.debug("open(%s)", resourceId);
    return gcsDelegate.open(resourceId);
  }

  /**
   * Updates cache with bucketName.
   *
   * @param bucketName Name of the bucket to create.
   * @throws IOException if the delegate fails; in that case the cache is not updated.
   */
  @Override
  public void create(String bucketName)
      throws IOException {
    log.debug("create(%s)", bucketName);
    // TODO(user): Make create() return the Bucket so that we can pre-emptively populate the
    // metadata in the CachedBucket.
    gcsDelegate.create(bucketName);
    resourceCache.putResourceId(new StorageResourceId(bucketName));
  }

  /**
   * Removes buckets from cache, if they exist.
   *
   * @param bucketNames Names of the buckets to delete.
   * @throws IOException if the delegate fails; in that case the cache is not updated.
   */
  @Override
  public void deleteBuckets(List<String> bucketNames)
      throws IOException {
    log.debug("deleteBuckets(%s)", bucketNames);
    // TODO(user): Potentially include as blacklist entry in cache along with timestamp to clobber
    // incorrect/stale "list" results from GCS as long as their returned timestamp is older than
    // the blacklist entry.
    gcsDelegate.deleteBuckets(bucketNames);
    for (String bucketName : bucketNames) {
      resourceCache.removeResourceId(new StorageResourceId(bucketName));
    }
  }

  /**
   * Removes objects from cache, if they exist.
   *
   * @param fullObjectNames Ids of the objects to delete.
   * @throws IOException if the delegate fails; in that case the cache is not updated.
   */
  @Override
  public void deleteObjects(List<StorageResourceId> fullObjectNames)
      throws IOException {
    log.debug("deleteObjects(%s)", fullObjectNames);
    // TODO(user): Potentially include as blacklist entry in cache along with timestamp to clobber
    // incorrect/stale "list" results from GCS as long as their returned timestamp is older than
    // the blacklist entry.
    gcsDelegate.deleteObjects(fullObjectNames);
    for (StorageResourceId resourceId : fullObjectNames) {
      resourceCache.removeResourceId(resourceId);
    }
  }

  /**
   * Adds the copied destination items to the list cache, without their associated metadata;
   * supplementing with the cache will have to populate the metadata on-demand.
   *
   * @param srcBucketName Bucket holding the source objects.
   * @param srcObjectNames Names of the source objects.
   * @param dstBucketName Bucket receiving the copies.
   * @param dstObjectNames Names of the destination objects; each is recorded in the cache under
   *     {@code dstBucketName} once the delegate's copy returns.
   * @throws IOException if the delegate fails; in that case the cache is not updated.
   */
  @Override
  public void copy(String srcBucketName, List<String> srcObjectNames,
      String dstBucketName, List<String> dstObjectNames)
      throws IOException {
    log.debug("copy(%s, %s, %s, %s)",
        srcBucketName, srcObjectNames, dstBucketName, dstObjectNames);
    // TODO(user): Maybe catch exceptions and check their inner exceptions for
    // FileNotFoundExceptions and update the DirectoryListCache accordingly. For partial failures,
    // we probably still want to add the successful ones to the list cache.
    // TODO(user): Make GCS.copy return the list of destination StorageObjects which were
    // successfully created, so that we can pre-emptively populate the metadata into the cache.
    gcsDelegate.copy(srcBucketName, srcObjectNames, dstBucketName, dstObjectNames);
    for (String dstObjectName : dstObjectNames) {
      resourceCache.putResourceId(new StorageResourceId(dstBucketName, dstObjectName));
    }
  }

  /**
   * Helper for checking the list of {@code candidateEntries} against a {@code originalIds} to
   * possibly retrieve supplemental results from the DirectoryListCache.
   * This method will modify {@code originalIds} as it goes to include the StorageResourceIds
   * of CacheEntrys being returned.
   *
   * @param originalIds Ids already present in the delegate's result; mutated by this method.
   * @param candidateEntries Cache entries to consider as supplements.
   * @return A list of CacheEntry which is a subset of {@code candidateEntries}, whose elements
   *     are not in the set of resourceIds corresponding to {@code originalIds}.
   */
  private List<CacheEntry> getSupplementalEntries(
      Set<StorageResourceId> originalIds, List<CacheEntry> candidateEntries) {
    List<CacheEntry> supplementalEntries = new ArrayList<>();
    for (CacheEntry entry : candidateEntries) {
      StorageResourceId entryId = entry.getResourceId();
      if (!originalIds.contains(entryId)) {
        supplementalEntries.add(entry);
        // Remember the id so that a later candidate with the same id is not returned twice.
        originalIds.add(entryId);
      }
    }
    return supplementalEntries;
  }

  /**
   * Helper for either pulling the existing GoogleCloudStorageItemInfo from each element of
   * {@code cacheEntries} or fetching the associated GoogleCloudStorageItemInfo on-demand, updating
   * the cache entry, then appending the new result to the return list. Items which fail to be
   * fetched will not be returned.
   *
   * @param cacheEntries Entries whose item infos should be returned.
   * @return The item infos, in the order of {@code cacheEntries}, minus entries whose fetched
   *     info reports that the item does not exist.
   * @throws IOException if fetching an item info from the delegate fails.
   */
  private List<GoogleCloudStorageItemInfo> extractItemInfos(List<CacheEntry> cacheEntries)
      throws IOException {
    // TODO(user): Batch these.
    List<GoogleCloudStorageItemInfo> supplementalInfos = new ArrayList<>();
    for (CacheEntry entry : cacheEntries) {
      GoogleCloudStorageItemInfo itemInfo = entry.getItemInfo();
      if (itemInfo != null) {
        // The detailed info is already available; supplement it directly.
        log.info("Supplementing missing itemInfo with already-cached info: %s", itemInfo);
        supplementalInfos.add(itemInfo);
      } else {
        // We need to fetch the associated info from the gcsDelegate; in addition to
        // supplementing, we must update the cache with the fetched info.
        log.info("Populating missing itemInfo on-demand for entry: %s", entry.getResourceId());
        itemInfo = gcsDelegate.getItemInfo(entry.getResourceId());
        if (!itemInfo.exists()) {
          // TODO(user): Change to info.toString() after adding a good toString().
          // TODO(user): Update the cache by removing it.
          log.error("Failed to fetch item info for a CacheEntry: %s", entry.getResourceId());
        } else {
          entry.setItemInfo(itemInfo);
          supplementalInfos.add(itemInfo);
        }
      }
    }
    return supplementalInfos;
  }

  /**
   * Supplements the list returned by the delegate with cached bucket names; won't trigger
   * any fetching of metadata.
   *
   * @return The delegate's own list when the cache holds no buckets; otherwise a new list with
   *     the delegate's names followed by the names of cached buckets the delegate did not return.
   * @throws IOException if the delegate's listing fails.
   */
  @Override
  public List<String> listBucketNames()
      throws IOException {
    log.debug("listBucketNames()");
    List<String> allBucketNames = gcsDelegate.listBucketNames();
    List<CacheEntry> cachedBuckets = resourceCache.getBucketList();
    // Null is tolerated the same way the listObject* methods tolerate it.
    if (cachedBuckets == null || cachedBuckets.isEmpty()) {
      return allBucketNames;
    }
    // Make a copy in case the delegate returned an immutable list.
    allBucketNames = new ArrayList<>(allBucketNames);

    Set<StorageResourceId> bucketIds = new HashSet<>();
    for (String bucketName : allBucketNames) {
      bucketIds.add(new StorageResourceId(bucketName));
    }

    List<CacheEntry> missingCachedBuckets = getSupplementalEntries(bucketIds, cachedBuckets);
    for (CacheEntry supplement : missingCachedBuckets) {
      log.info("Supplementing missing matched StorageResourceId: %s", supplement.getResourceId());
      allBucketNames.add(supplement.getResourceId().getBucketName());
    }
    return allBucketNames;
  }

  /**
   * Supplements the list returned by the delegate with cached bucket infos; may trigger fetching
   * of any metadata not already available in the cache.
   *
   * @return The delegate's own list when the cache holds no buckets; otherwise a new list with
   *     the delegate's infos followed by the infos of cached buckets the delegate did not return.
   * @throws IOException if the delegate's listing or an on-demand metadata fetch fails.
   */
  @Override
  public List<GoogleCloudStorageItemInfo> listBucketInfo()
      throws IOException {
    log.debug("listBucketInfo()");
    List<GoogleCloudStorageItemInfo> allBucketInfos = gcsDelegate.listBucketInfo();
    List<CacheEntry> cachedBuckets = resourceCache.getBucketList();
    // Null is tolerated the same way the listObject* methods tolerate it.
    if (cachedBuckets == null || cachedBuckets.isEmpty()) {
      return allBucketInfos;
    }
    // Make a copy in case the delegate returned an immutable list.
    allBucketInfos = new ArrayList<>(allBucketInfos);

    Set<StorageResourceId> bucketIdsSet = new HashSet<>();
    for (GoogleCloudStorageItemInfo itemInfo : allBucketInfos) {
      bucketIdsSet.add(itemInfo.getResourceId());
    }
    List<CacheEntry> missingCachedBuckets = getSupplementalEntries(bucketIdsSet, cachedBuckets);
    List<GoogleCloudStorageItemInfo> supplementalInfos = extractItemInfos(missingCachedBuckets);

    allBucketInfos.addAll(supplementalInfos);
    return allBucketInfos;
  }

  /**
   * Supplements the list returned by the delegate with cached object names; won't trigger
   * any fetching of metadata. Equivalent to the four-argument overload with
   * {@code GoogleCloudStorage.MAX_RESULTS_UNLIMITED}.
   */
  @Override
  public List<String> listObjectNames(
      String bucketName, String objectNamePrefix, String delimiter)
      throws IOException {
    return listObjectNames(bucketName, objectNamePrefix, delimiter,
        GoogleCloudStorage.MAX_RESULTS_UNLIMITED);
  }

  /**
   * Supplements the list returned by the delegate with cached object names; won't trigger
   * any fetching of metadata.
   *
   * @param bucketName Bucket to list.
   * @param objectNamePrefix Prefix passed to both the delegate and the cache lookup.
   * @param delimiter Delimiter passed to both the delegate and the cache lookup.
   * @param maxResults When positive, supplementing stops once the result reaches this size;
   *     values of zero or less impose no limit here.
   * @return The delegate's own list when nothing needs to be supplemented; otherwise a new list
   *     with the delegate's names followed by cached names the delegate did not return.
   * @throws IOException if the delegate's listing fails.
   */
  @Override
  public List<String> listObjectNames(
      String bucketName, String objectNamePrefix, String delimiter,
      long maxResults)
      throws IOException {
    log.debug("listObjectNames(%s, %s, %s, %d)", bucketName, objectNamePrefix,
        delimiter, maxResults);
    List<String> allObjectNames = gcsDelegate.listObjectNames(
        bucketName, objectNamePrefix, delimiter, maxResults);

    if (maxResults > 0 && allObjectNames.size() >= maxResults) {
      // Should not have allObjectNames.size() > maxResults, since we
      // passed maxResults to delegate.
      return allObjectNames;
    }

    // We pass 'null' for 'prefixes' because for now, we won't try to supplement match "prefixes";
    // in normal operation, the cache will also contain the "parent directory" objects for each
    // file, so they would be supplemented as exact matches anyway (if we have gs://bucket/foo/ and
    // gs://bucket/foo/bar, we won't need gs://bucket/foo/bar to generate the "prefix match"
    // gs://bucket/foo/, since the exact directory object already exists).
    // The only exception is if a *different* client created the directory object, so that
    // the local client created the file without creating the directory objects, and then
    // the list API fails to list either object. This is a case of cross-client inconsistency
    // not solved by this cache.
    List<CacheEntry> cachedObjects = resourceCache.getObjectList(
        bucketName, objectNamePrefix, delimiter, null);
    if (cachedObjects == null || cachedObjects.isEmpty()) {
      return allObjectNames;
    }
    // Make a copy in case the delegate returned an immutable list.
    allObjectNames = new ArrayList<>(allObjectNames);

    Set<StorageResourceId> objectIds = new HashSet<>();
    for (String objectName : allObjectNames) {
      objectIds.add(new StorageResourceId(bucketName, objectName));
    }

    List<CacheEntry> missingCachedObjects = getSupplementalEntries(objectIds, cachedObjects);
    for (CacheEntry supplement : missingCachedObjects) {
      log.info("Supplementing missing matched StorageResourceId: %s", supplement.getResourceId());
      allObjectNames.add(supplement.getResourceId().getObjectName());
      if (maxResults > 0 && allObjectNames.size() >= maxResults) {
        return allObjectNames;
      }
    }
    return allObjectNames;
  }

  /**
   * Supplements the list returned by the delegate with cached object infos; may trigger fetching
   * of any metadata not already available in the cache. Equivalent to the four-argument overload
   * with {@code GoogleCloudStorage.MAX_RESULTS_UNLIMITED}.
   */
  @Override
  public List<GoogleCloudStorageItemInfo> listObjectInfo(
      String bucketName, String objectNamePrefix, String delimiter)
      throws IOException {
    return listObjectInfo(bucketName, objectNamePrefix, delimiter,
        GoogleCloudStorage.MAX_RESULTS_UNLIMITED);
  }

  /**
   * Supplements the list returned by the delegate with cached object infos; may trigger fetching
   * of any metadata not already available in the cache.
   *
   * @param bucketName Bucket to list.
   * @param objectNamePrefix Prefix passed to both the delegate and the cache lookup.
   * @param delimiter Delimiter passed to both the delegate and the cache lookup.
   * @param maxResults When positive, the result is capped at this size; values of zero or less
   *     impose no limit here.
   * @return The delegate's own list when nothing needs to be supplemented; otherwise a new list
   *     with the delegate's infos followed by infos of cached objects the delegate did not
   *     return.
   * @throws IOException if the delegate's listing or an on-demand metadata fetch fails.
   */
  @Override
  public List<GoogleCloudStorageItemInfo> listObjectInfo(
      String bucketName, String objectNamePrefix, String delimiter,
      long maxResults)
      throws IOException {
    log.debug("listObjectInfo(%s, %s, %s, %d)", bucketName, objectNamePrefix,
        delimiter, maxResults);
    List<GoogleCloudStorageItemInfo> allObjectInfos =
        gcsDelegate.listObjectInfo(bucketName, objectNamePrefix, delimiter,
            maxResults);
    if (maxResults > 0 && allObjectInfos.size() >= maxResults) {
      return allObjectInfos;
    }
    // 'null' is passed for the last argument exactly as in listObjectNames.
    List<CacheEntry> cachedObjects = resourceCache.getObjectList(
        bucketName, objectNamePrefix, delimiter, null);
    if (cachedObjects == null || cachedObjects.isEmpty()) {
      return allObjectInfos;
    }
    // Make a copy in case the delegate returned an immutable list.
    allObjectInfos = new ArrayList<>(allObjectInfos);

    // TODO(user): Refactor out more of the shared logic between the 4 list* methods.
    Set<StorageResourceId> objectIdsSet = new HashSet<>();
    for (GoogleCloudStorageItemInfo itemInfo : allObjectInfos) {
      objectIdsSet.add(itemInfo.getResourceId());
    }

    List<CacheEntry> missingCachedObjects = getSupplementalEntries(objectIdsSet, cachedObjects);
    List<GoogleCloudStorageItemInfo> supplementalInfos = extractItemInfos(missingCachedObjects);

    // The list is known to be below a positive maxResults at this point (see the early return
    // above), so checking the cap before each add yields at most maxResults entries.
    for (GoogleCloudStorageItemInfo item : supplementalInfos) {
      if (maxResults > 0 && allObjectInfos.size() >= maxResults) {
        break;
      }
      allObjectInfos.add(item);
    }

    return allObjectInfos;
  }

  /**
   * Pure pass-through.
   */
  // NOTE(review): the element types of the parameter and the returned list are not evident from
  // this class, so the signature is left raw; confirm against the GoogleCloudStorage interface
  // and parameterize accordingly.
  @Override
  public List getItemInfos(List resourceIds)
      throws IOException {
    log.debug("getItemInfos(%s)", resourceIds);
    return gcsDelegate.getItemInfos(resourceIds);
  }

  /**
   * Pure pass-through.
   */
  // NOTE(review): the element types of the parameter and the returned list are not evident from
  // this class, so the signature is left raw; confirm against the GoogleCloudStorage interface
  // and parameterize accordingly.
  @Override
  public List updateItems(List itemInfoList)
      throws IOException {
    log.debug("updateItems(%s)", itemInfoList);
    return gcsDelegate.updateItems(itemInfoList);
  }

  /**
   * Pure pass-through.
   */
  @Override
  public GoogleCloudStorageItemInfo getItemInfo(StorageResourceId resourceId)
      throws IOException {
    log.debug("getItemInfo(%s)", resourceId);
    // TODO(user): Maybe opportunistically update the cache with any retrieved info; it would take
    // more memory but potentially improve cache coherence. Here and in getItemInfos.
    return gcsDelegate.getItemInfo(resourceId);
  }

  /**
   * Pure pass-through.
   */
  @Override
  public void close() {
    gcsDelegate.close();
  }

  /**
   * Pure pass-through.
   */
  @Override
  public void waitForBucketEmpty(String bucketName)
      throws IOException {
    gcsDelegate.waitForBucketEmpty(bucketName);
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy