// com.google.cloud.hadoop.gcsio.CacheSupplementedGoogleCloudStorage Maven / Gradle / Ivy
/**
* Copyright 2013 Google Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.cloud.hadoop.gcsio;
import com.google.cloud.hadoop.util.LogUtil;
import com.google.common.base.Preconditions;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.WritableByteChannel;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
* CacheSupplementedGoogleCloudStorage adds additional book-keeping to a GoogleCloudStorage instance
* using a {@code DirectoryListCache} and wraps the create/copy/delete/list methods to provide
* immediate same-client consistency for "list" operations following a "create/copy/delete". See
* {@code DirectoryListCache} for details of consistency semantics.
*/
public class CacheSupplementedGoogleCloudStorage
implements GoogleCloudStorage {
// Logger.
private static final LogUtil log = new LogUtil(CacheSupplementedGoogleCloudStorage.class);

// An actual implementation of GoogleCloudStorage which will be used for the actual logic of
// GCS operations, while this class adds book-keeping around the delegated calls.
private final GoogleCloudStorage gcsDelegate;

// Cache of freshly created Buckets or StorageObjects to be updated on create/copy/delete to
// supplement "list" calls with GCS resources which may not have appeared in the Cloud list
// index yet. The reference is assigned exactly once, in the constructor, and is therefore final.
// TODO(user): Add support for perf-boosting use-cases, such as serving getItemInfo directly
// from cache once we have plumbing in-place to pre-populate metadata on create/copy. Also,
// consider cases where it's possible to serve list* exclusively from cache as long as cross-
// client consistency isn't enforced.
private final DirectoryListCache resourceCache;
/**
 * Builds a wrapper which can be used anywhere a GoogleCloudStorage is expected. Results of the
 * delegate's listObject/listBucket calls are supplemented from an in-memory cache of GCS
 * resources known to this client which may not yet be visible in the eventually-consistent
 * remote "list" index.
 *
 * @param gcsDelegate The GoogleCloudStorage to be used for normal API interactions, before
 *     supplementing with in-memory info.
 * @param resourceCache The DirectoryListCache which is updated on create/copy/delete and
 *     consulted when listing.
 * @throws IllegalArgumentException if either argument is null.
 */
public CacheSupplementedGoogleCloudStorage(
    GoogleCloudStorage gcsDelegate, DirectoryListCache resourceCache) {
  // Validation order is kept (delegate first) so the same message wins when both are null.
  Preconditions.checkArgument(null != gcsDelegate, "gcsDelegate must not be null");
  this.gcsDelegate = gcsDelegate;

  Preconditions.checkArgument(null != resourceCache, "resourceCache must not be null");
  this.resourceCache = resourceCache;
}
/**
 * Creates the object using {@code CreateObjectOptions.DEFAULT} by forwarding to
 * {@code create(StorageResourceId, CreateObjectOptions)}, which wraps the delegate's
 * WritableByteChannel so that the resourceCache is updated when close() is called.
 *
 * @param resourceId The object to create.
 * @return The wrapping channel produced by the two-argument overload.
 * @throws IOException if the two-argument overload throws.
 */
@Override
public WritableByteChannel create(final StorageResourceId resourceId)
    throws IOException {
  log.debug("create(%s)", resourceId);
  final CreateObjectOptions defaultOptions = CreateObjectOptions.DEFAULT;
  return create(resourceId, defaultOptions);
}
/**
 * Opens a write channel through the delegate and wraps it so that the resourceId is recorded in
 * the resourceCache once the channel has been closed successfully.
 *
 * @param resourceId The object to create.
 * @param options Creation options forwarded unchanged to the delegate.
 * @return A channel forwarding write()/isOpen()/close() to the delegate's channel.
 * @throws IOException if the delegate fails to create the channel.
 */
@Override
public WritableByteChannel create(final StorageResourceId resourceId, CreateObjectOptions options)
    throws IOException {
  log.debug("create(%s, %s)", resourceId, options);
  final WritableByteChannel innerChannel = gcsDelegate.create(resourceId, options);

  // Wrap the delegate's channel in our own channel which simply adds the additional book-keeping
  // hook to close().
  return new WritableByteChannel() {
    // Set once the cache has been updated, so that repeated close() calls stay no-ops for the
    // cache, in line with the idempotence required of Channel.close().
    private boolean cacheUpdated = false;

    @Override
    public int write(ByteBuffer buffer)
        throws IOException {
      return innerChannel.write(buffer);
    }

    @Override
    public boolean isOpen() {
      return innerChannel.isOpen();
    }

    @Override
    public void close()
        throws IOException {
      // If the inner close() throws, nothing is recorded; a later successful close() will
      // still record the resource.
      innerChannel.close();

      // Record only on the first successful close. Re-inserting on a later redundant close()
      // could resurrect a cache entry for an object which was deleted in the meantime.
      if (!cacheUpdated) {
        // TODO(user): Make create() somehow wire the StorageObject through to the caller,
        // possibly through an onClose() handler so that we can pre-emptively populate the
        // metadata in the CacheEntry.
        resourceCache.putResourceId(resourceId);
        cacheUpdated = true;
      }
    }
  };
}
/**
 * Creates an empty object through the delegate and, if that succeeds, records the resourceId
 * in the resourceCache.
 *
 * @param resourceId The object to create.
 * @throws IOException if the delegate throws; the cache is left untouched in that case.
 */
@Override
public void createEmptyObject(StorageResourceId resourceId) throws IOException {
  log.debug("createEmptyObject(%s)", resourceId);

  // Delegate first: the cache must only learn about objects whose creation did not throw.
  gcsDelegate.createEmptyObject(resourceId);
  resourceCache.putResourceId(resourceId);
}
/**
 * Creates an empty object with the given options through the delegate and, if that succeeds,
 * records the resourceId in the resourceCache.
 *
 * @param resourceId The object to create.
 * @param options Creation options forwarded unchanged to the delegate.
 * @throws IOException if the delegate throws; the cache is left untouched in that case.
 */
@Override
public void createEmptyObject(StorageResourceId resourceId, CreateObjectOptions options)
    throws IOException {
  log.debug("createEmptyObject(%s, %s)", resourceId, options);

  // Delegate first: the cache must only learn about objects whose creation did not throw.
  gcsDelegate.createEmptyObject(resourceId, options);
  resourceCache.putResourceId(resourceId);
}
/**
 * Creates the empty objects through the delegate and, if that succeeds, records every
 * resourceId in the resourceCache.
 *
 * @param resourceIds The objects to create.
 * @throws IOException if the delegate throws; no id is recorded in that case.
 */
@Override
public void createEmptyObjects(List<StorageResourceId> resourceIds)
    throws IOException {
  log.debug("createEmptyObjects(%s)", resourceIds);
  gcsDelegate.createEmptyObjects(resourceIds);
  for (StorageResourceId resourceId : resourceIds) {
    resourceCache.putResourceId(resourceId);
  }
}
/**
 * Creates the empty objects with the given options through the delegate and, if that succeeds,
 * records every resourceId in the resourceCache.
 *
 * @param resourceIds The objects to create.
 * @param options Creation options forwarded unchanged to the delegate.
 * @throws IOException if the delegate throws; no id is recorded in that case.
 */
@Override
public void createEmptyObjects(List<StorageResourceId> resourceIds, CreateObjectOptions options)
    throws IOException {
  log.debug("createEmptyObjects(%s, %s)", resourceIds, options);
  gcsDelegate.createEmptyObjects(resourceIds, options);
  for (StorageResourceId resourceId : resourceIds) {
    resourceCache.putResourceId(resourceId);
  }
}
/**
 * Pure pass-through to the delegate; the resourceCache is neither read nor modified.
 *
 * @param resourceId The object to open for reading.
 * @return The channel returned by the delegate.
 * @throws IOException if the delegate throws.
 */
@Override
public SeekableReadableByteChannel open(StorageResourceId resourceId) throws IOException {
  log.debug("open(%s)", resourceId);
  SeekableReadableByteChannel delegateChannel = gcsDelegate.open(resourceId);
  return delegateChannel;
}
/**
 * Creates the bucket through the delegate and, if that succeeds, records a bucket-level
 * StorageResourceId for it in the resourceCache.
 *
 * @param bucketName Name of the bucket to create.
 * @throws IOException if the delegate throws; the cache is left untouched in that case.
 */
@Override
public void create(String bucketName) throws IOException {
  log.debug("create(%s)", bucketName);

  // TODO(user): Make create() return the Bucket so that we can pre-emptively populate the
  // metadata in the CachedBucket.
  gcsDelegate.create(bucketName);

  // The id is built only after the delegate call returned, exactly as before.
  StorageResourceId bucketId = new StorageResourceId(bucketName);
  resourceCache.putResourceId(bucketId);
}
/**
 * Deletes the buckets through the delegate and, if that succeeds, removes each of them from
 * the resourceCache.
 *
 * @param bucketNames Names of the buckets to delete.
 * @throws IOException if the delegate throws; the cache is left untouched in that case.
 */
@Override
public void deleteBuckets(List<String> bucketNames)
    throws IOException {
  log.debug("deleteBuckets(%s)", bucketNames);
  // TODO(user): Potentially include as blacklist entry in cache along with timestamp to clobber
  // incorrect/stale "list" results from GCS as long as their returned timestamp is older than
  // the blacklist entry.
  gcsDelegate.deleteBuckets(bucketNames);
  for (String bucketName : bucketNames) {
    resourceCache.removeResourceId(new StorageResourceId(bucketName));
  }
}
/**
 * Deletes the objects through the delegate and, if that succeeds, removes each of them from
 * the resourceCache.
 *
 * @param fullObjectNames Ids of the objects to delete.
 * @throws IOException if the delegate throws; the cache is left untouched in that case.
 */
@Override
public void deleteObjects(List<StorageResourceId> fullObjectNames)
    throws IOException {
  log.debug("deleteObjects(%s)", fullObjectNames);
  // TODO(user): Potentially include as blacklist entry in cache along with timestamp to clobber
  // incorrect/stale "list" results from GCS as long as their returned timestamp is older than
  // the blacklist entry.
  gcsDelegate.deleteObjects(fullObjectNames);
  for (StorageResourceId resourceId : fullObjectNames) {
    resourceCache.removeResourceId(resourceId);
  }
}
/**
 * Copies through the delegate and, if that succeeds, adds the destination items to the list
 * cache without their associated metadata; supplementing with the cache will have to populate
 * the metadata on-demand.
 *
 * @param srcBucketName Bucket holding the source objects.
 * @param srcObjectNames Names of the source objects.
 * @param dstBucketName Bucket receiving the copies.
 * @param dstObjectNames Names of the destination objects.
 * @throws IOException if the delegate throws; no destination is recorded in that case.
 */
@Override
public void copy(String srcBucketName, List<String> srcObjectNames,
    String dstBucketName, List<String> dstObjectNames)
    throws IOException {
  log.debug("copy(%s, %s, %s, %s)", srcBucketName, srcObjectNames,
      dstBucketName, dstObjectNames);
  // TODO(user): Maybe catch exceptions and check their inner exceptions for
  // FileNotFoundExceptions and update the DirectoryListCache accordingly. For partial failures,
  // we probably still want to add the successful ones to the list cache.
  // TODO(user): Make GCS.copy return the list of destination StorageObjects which were
  // successfully created, so that we can pre-emptively populate the metadata into the cache.
  gcsDelegate.copy(srcBucketName, srcObjectNames, dstBucketName, dstObjectNames);
  for (String dstObjectName : dstObjectNames) {
    resourceCache.putResourceId(new StorageResourceId(dstBucketName, dstObjectName));
  }
}
/**
 * Helper for checking the list of {@code candidateEntries} against {@code originalIds} to
 * possibly retrieve supplemental results from the DirectoryListCache.
 * This method modifies {@code originalIds} as it goes, adding the StorageResourceId of every
 * CacheEntry being returned; a candidate id which occurs twice is therefore returned only once.
 *
 * @param originalIds Ids already present in the result; mutated by this call.
 * @param candidateEntries Cache entries which may be missing from {@code originalIds}.
 * @return A new list holding, in iteration order, those elements of {@code candidateEntries}
 *     whose resourceId was not in {@code originalIds} when they were visited.
 */
private List<CacheEntry> getSupplementalEntries(
    Set<StorageResourceId> originalIds, List<CacheEntry> candidateEntries) {
  List<CacheEntry> supplementalEntries = new ArrayList<>();
  for (CacheEntry entry : candidateEntries) {
    // Set.add() returns true only if the id was absent, which covers both the membership
    // test and the insertion in a single lookup.
    if (originalIds.add(entry.getResourceId())) {
      supplementalEntries.add(entry);
    }
  }
  return supplementalEntries;
}
/**
 * Helper for either pulling the existing GoogleCloudStorageItemInfo from each element of
 * {@code cacheEntries} or fetching the associated GoogleCloudStorageItemInfo on-demand from the
 * delegate, storing it on the cache entry, then appending it to the return list. Entries whose
 * fetched info reports {@code exists() == false} are logged as errors and not returned.
 *
 * @param cacheEntries Entries for which item infos are needed.
 * @return Item infos, in iteration order, for every entry which had cached info or whose
 *     fetched info exists.
 * @throws IOException if the delegate's getItemInfo throws.
 */
private List<GoogleCloudStorageItemInfo> extractItemInfos(List<CacheEntry> cacheEntries)
    throws IOException {
  // TODO(user): Batch these.
  List<GoogleCloudStorageItemInfo> supplementalInfos = new ArrayList<>();
  for (CacheEntry entry : cacheEntries) {
    GoogleCloudStorageItemInfo itemInfo = entry.getItemInfo();
    if (itemInfo != null) {
      // The detailed info is already available; supplement it directly.
      log.info("Supplementing missing itemInfo with already-cached info: %s", itemInfo);
      supplementalInfos.add(itemInfo);
      continue;
    }

    // We need to fetch the associated info from the gcsDelegate; in addition to
    // supplementing, we must update the cache with the fetched info.
    log.info("Populating missing itemInfo on-demand for entry: %s", entry.getResourceId());
    itemInfo = gcsDelegate.getItemInfo(entry.getResourceId());
    if (!itemInfo.exists()) {
      // TODO(user): Change to info.toString() after adding a good toString().
      // TODO(user): Update the cache by removing it.
      log.error("Failed to fetch item info for a CacheEntry: %s", entry.getResourceId());
      continue;
    }
    entry.setItemInfo(itemInfo);
    supplementalInfos.add(itemInfo);
  }
  return supplementalInfos;
}
/**
 * Supplements the list returned by the delegate with cached bucket names; won't trigger
 * any fetching of metadata.
 *
 * @return The delegate's own list when the cache holds no buckets; otherwise a new list with
 *     the delegate's names followed by the cached bucket names it did not contain.
 * @throws IOException if the delegate throws.
 */
@Override
public List<String> listBucketNames()
    throws IOException {
  log.debug("listBucketNames()");
  List<String> allBucketNames = gcsDelegate.listBucketNames();
  List<CacheEntry> cachedBuckets = resourceCache.getBucketList();
  if (cachedBuckets.isEmpty()) {
    return allBucketNames;
  }

  // Make a copy in case the delegate returned an immutable list.
  allBucketNames = new ArrayList<>(allBucketNames);

  Set<StorageResourceId> bucketIds = new HashSet<>();
  for (String bucketName : allBucketNames) {
    bucketIds.add(new StorageResourceId(bucketName));
  }

  List<CacheEntry> missingCachedBuckets = getSupplementalEntries(bucketIds, cachedBuckets);
  for (CacheEntry supplement : missingCachedBuckets) {
    log.info("Supplementing missing matched StorageResourceId: %s", supplement.getResourceId());
    allBucketNames.add(supplement.getResourceId().getBucketName());
  }
  return allBucketNames;
}
/**
 * Supplements the list returned by the delegate with cached bucket infos; may trigger fetching
 * of metadata for cached buckets which are missing from the delegate's result and whose cache
 * entry has no metadata yet. Cache entries for buckets which the delegate did return are not
 * touched by this method.
 *
 * @return The delegate's own list when the cache holds no buckets; otherwise a new list with
 *     the delegate's infos followed by the infos of the supplemental cached buckets.
 * @throws IOException if the delegate throws, either while listing or while fetching metadata.
 */
@Override
public List<GoogleCloudStorageItemInfo> listBucketInfo()
    throws IOException {
  log.debug("listBucketInfo()");
  List<GoogleCloudStorageItemInfo> allBucketInfos = gcsDelegate.listBucketInfo();
  List<CacheEntry> cachedBuckets = resourceCache.getBucketList();
  if (cachedBuckets.isEmpty()) {
    return allBucketInfos;
  }

  // Make a copy in case the delegate returned an immutable list.
  allBucketInfos = new ArrayList<>(allBucketInfos);

  Set<StorageResourceId> bucketIdsSet = new HashSet<>();
  for (GoogleCloudStorageItemInfo itemInfo : allBucketInfos) {
    bucketIdsSet.add(itemInfo.getResourceId());
  }

  List<CacheEntry> missingCachedBuckets = getSupplementalEntries(bucketIdsSet, cachedBuckets);
  List<GoogleCloudStorageItemInfo> supplementalInfos = extractItemInfos(missingCachedBuckets);
  allBucketInfos.addAll(supplementalInfos);
  return allBucketInfos;
}
/**
 * Supplements the list returned by the delegate with cached object names; won't trigger
 * any fetching of metadata. Equivalent to the four-argument overload called with
 * {@code GoogleCloudStorage.MAX_RESULTS_UNLIMITED}.
 *
 * @param bucketName Bucket to list.
 * @param objectNamePrefix Prefix forwarded to the delegate and to the cache lookup.
 * @param delimiter Delimiter forwarded to the delegate and to the cache lookup.
 * @return The object names produced by the four-argument overload.
 * @throws IOException if the four-argument overload throws.
 */
@Override
public List<String> listObjectNames(
    String bucketName, String objectNamePrefix, String delimiter)
    throws IOException {
  return listObjectNames(bucketName, objectNamePrefix, delimiter,
      GoogleCloudStorage.MAX_RESULTS_UNLIMITED);
}
/**
 * Supplements the list returned by the delegate with cached object names; won't trigger
 * any fetching of metadata.
 *
 * @param bucketName Bucket to list.
 * @param objectNamePrefix Prefix forwarded to the delegate and to the cache lookup.
 * @param delimiter Delimiter forwarded to the delegate and to the cache lookup.
 * @param maxResults Upper bound on the number of returned names; values {@code <= 0} disable
 *     the bound.
 * @return The delegate's own list when it already holds {@code maxResults} names or the cache
 *     has nothing to offer; otherwise a new list with the delegate's names followed by cached
 *     names it did not contain, cut off once {@code maxResults} names have been collected.
 * @throws IOException if the delegate throws.
 */
@Override
public List<String> listObjectNames(
    String bucketName, String objectNamePrefix, String delimiter,
    long maxResults)
    throws IOException {
  log.debug("listObjectNames(%s, %s, %s, %d)", bucketName, objectNamePrefix,
      delimiter, maxResults);
  List<String> allObjectNames = gcsDelegate.listObjectNames(
      bucketName, objectNamePrefix, delimiter, maxResults);
  if (maxResults > 0 && allObjectNames.size() >= maxResults) {
    // Should not have allObjectNames.size() > maxResults, since we
    // passed maxResults to delegate.
    return allObjectNames;
  }

  // We pass 'null' for 'prefixes' because for now, we won't try to supplement match "prefixes";
  // in normal operation, the cache will also contain the "parent directory" objects for each
  // file, so they would be supplemented as exact matches anyway (if we have gs://bucket/foo/ and
  // gs://bucket/foo/bar, we won't need gs://bucket/foo/bar to generate the "prefix match"
  // gs://bucket/foo/, since the exact directory object already exists).
  // The only exception is if a *different* client created the directory object, so that
  // the local client created the file without creating the directory objects, and then
  // the list API fails to list either object. This is a case of cross-client inconsistency
  // not solved by this cache.
  List<CacheEntry> cachedObjects = resourceCache.getObjectList(
      bucketName, objectNamePrefix, delimiter, null);
  if (cachedObjects == null || cachedObjects.isEmpty()) {
    return allObjectNames;
  }

  // Make a copy in case the delegate returned an immutable list.
  allObjectNames = new ArrayList<>(allObjectNames);

  Set<StorageResourceId> objectIds = new HashSet<>();
  for (String objectName : allObjectNames) {
    objectIds.add(new StorageResourceId(bucketName, objectName));
  }

  List<CacheEntry> missingCachedObjects = getSupplementalEntries(objectIds, cachedObjects);
  for (CacheEntry supplement : missingCachedObjects) {
    log.info("Supplementing missing matched StorageResourceId: %s", supplement.getResourceId());
    allObjectNames.add(supplement.getResourceId().getObjectName());
    if (maxResults > 0 && allObjectNames.size() >= maxResults) {
      // The bound has been reached; remaining supplements are dropped.
      break;
    }
  }
  return allObjectNames;
}
/**
 * Supplements the list returned by the delegate with cached object infos; may trigger fetching
 * of metadata for supplemental cache entries which do not carry it yet. Equivalent to the
 * four-argument overload called with {@code GoogleCloudStorage.MAX_RESULTS_UNLIMITED}.
 *
 * @param bucketName Bucket to list.
 * @param objectNamePrefix Prefix forwarded to the delegate and to the cache lookup.
 * @param delimiter Delimiter forwarded to the delegate and to the cache lookup.
 * @return The item infos produced by the four-argument overload.
 * @throws IOException if the four-argument overload throws.
 */
@Override
public List<GoogleCloudStorageItemInfo> listObjectInfo(
    String bucketName, String objectNamePrefix, String delimiter)
    throws IOException {
  return listObjectInfo(bucketName, objectNamePrefix, delimiter,
      GoogleCloudStorage.MAX_RESULTS_UNLIMITED);
}
/**
 * Supplements the list returned by the delegate with cached object infos; may trigger fetching
 * of metadata for cached objects which are missing from the delegate's result and whose cache
 * entry has no metadata yet. Metadata is resolved for every supplemental entry, including any
 * which are subsequently cut off by {@code maxResults}. Cache entries for objects which the
 * delegate did return are not touched by this method.
 *
 * @param bucketName Bucket to list.
 * @param objectNamePrefix Prefix forwarded to the delegate and to the cache lookup.
 * @param delimiter Delimiter forwarded to the delegate and to the cache lookup.
 * @param maxResults Upper bound on the number of returned infos; values {@code <= 0} disable
 *     the bound.
 * @return The delegate's own list when it already holds {@code maxResults} infos or the cache
 *     has nothing to offer; otherwise a new list with the delegate's infos followed by as many
 *     supplemental infos as the bound allows.
 * @throws IOException if the delegate throws, either while listing or while fetching metadata.
 */
@Override
public List<GoogleCloudStorageItemInfo> listObjectInfo(
    String bucketName, String objectNamePrefix, String delimiter,
    long maxResults)
    throws IOException {
  log.debug("listObjectInfo(%s, %s, %s, %d)", bucketName, objectNamePrefix,
      delimiter, maxResults);
  List<GoogleCloudStorageItemInfo> allObjectInfos =
      gcsDelegate.listObjectInfo(bucketName, objectNamePrefix, delimiter,
          maxResults);
  if (maxResults > 0 && allObjectInfos.size() >= maxResults) {
    return allObjectInfos;
  }

  // 'null' is passed for the prefixes argument, as in listObjectNames.
  List<CacheEntry> cachedObjects = resourceCache.getObjectList(
      bucketName, objectNamePrefix, delimiter, null);
  if (cachedObjects == null || cachedObjects.isEmpty()) {
    return allObjectInfos;
  }

  // Make a copy in case the delegate returned an immutable list.
  allObjectInfos = new ArrayList<>(allObjectInfos);

  // TODO(user): Refactor out more of the shared logic between the 4 list* methods.
  Set<StorageResourceId> objectIdsSet = new HashSet<>();
  for (GoogleCloudStorageItemInfo itemInfo : allObjectInfos) {
    objectIdsSet.add(itemInfo.getResourceId());
  }

  List<CacheEntry> missingCachedObjects = getSupplementalEntries(objectIdsSet, cachedObjects);
  List<GoogleCloudStorageItemInfo> supplementalInfos = extractItemInfos(missingCachedObjects);

  if (maxResults > 0) {
    // The early return above guarantees allObjectInfos.size() < maxResults, so at least one
    // slot is left; keep only as many supplements as still fit.
    long remaining = maxResults - allObjectInfos.size();
    if (supplementalInfos.size() > remaining) {
      // remaining < supplementalInfos.size() here, so the narrowing cast cannot overflow.
      supplementalInfos = supplementalInfos.subList(0, (int) remaining);
    }
  }
  allObjectInfos.addAll(supplementalInfos);
  return allObjectInfos;
}
/**
 * Pure pass-through to the delegate; the resourceCache is neither read nor modified.
 *
 * <p>NOTE(review): the parameter and return type are raw {@code List}s here; the element types
 * appear to have been lost and should be confirmed against the GoogleCloudStorage interface.
 *
 * @param resourceIds Ids to look up, forwarded unchanged to the delegate.
 * @return Whatever the delegate returns.
 * @throws IOException if the delegate throws.
 */
@Override
public List getItemInfos(List resourceIds)
    throws IOException {
  // Pass the list itself, as the sibling methods do, instead of calling toString() eagerly:
  // that avoids building the string on every call and avoids a NullPointerException raised by
  // the log statement rather than by the delegate.
  log.debug("getItemInfos(%s)", resourceIds);
  return gcsDelegate.getItemInfos(resourceIds);
}
/**
 * Pure pass-through to the delegate; the resourceCache is neither read nor modified.
 *
 * <p>NOTE(review): the parameter and return type are raw {@code List}s here; the element types
 * appear to have been lost and should be confirmed against the GoogleCloudStorage interface.
 *
 * @param itemInfoList Updates to apply, forwarded unchanged to the delegate.
 * @return Whatever the delegate returns.
 * @throws IOException if the delegate throws.
 */
@Override
public List updateItems(List itemInfoList) throws IOException {
  log.debug("updateItems(%s)", itemInfoList);
  List delegateResult = gcsDelegate.updateItems(itemInfoList);
  return delegateResult;
}
/**
 * Pure pass-through to the delegate; the resourceCache is neither read nor modified.
 *
 * @param resourceId Id of the item to look up.
 * @return The item info returned by the delegate.
 * @throws IOException if the delegate throws.
 */
@Override
public GoogleCloudStorageItemInfo getItemInfo(StorageResourceId resourceId) throws IOException {
  log.debug("getItemInfo(%s)", resourceId);

  // TODO(user): Maybe opportunistically update the cache with any retrieved info; it would take
  // more memory but potentially improve cache coherence. Here and in getItemInfos.
  GoogleCloudStorageItemInfo delegateInfo = gcsDelegate.getItemInfo(resourceId);
  return delegateInfo;
}
/**
 * Closes the delegate. Pure pass-through; the resourceCache is not touched.
 */
@Override
public void close() {
  this.gcsDelegate.close();
}
/**
 * Pure pass-through to the delegate; the resourceCache is neither read nor modified.
 *
 * @param bucketName Name of the bucket, forwarded unchanged to the delegate.
 * @throws IOException if the delegate throws.
 */
@Override
public void waitForBucketEmpty(String bucketName) throws IOException {
  this.gcsDelegate.waitForBucketEmpty(bucketName);
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy