// com.google.cloud.hadoop.gcsio.GoogleCloudStorage Maven / Gradle / Ivy
//
// Go to download
/**
 * Copyright 2013 Google Inc. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.cloud.hadoop.gcsio;

import java.io.IOException;
import java.nio.channels.WritableByteChannel;
import java.util.List;

/**
 * Interface for exposing the Google Cloud Storage API behavior in a way more amenable to writing
 * filesystem semantics on top of it, without having to deal with API-specific considerations such
 * as HttpTransports, credentials, network errors, batching, etc.
 *
 * <p>Please read the following document to get familiarity with basic GCS concepts:
 * https://developers.google.com/storage/docs/concepts-techniques
 */
public interface GoogleCloudStorage {
  /**
   * Pseudo path delimiter.
   *
   * <p>GCS does not implement full concept of file system paths but it does expose
   * some notion of a delimiter that can be used with Storage.Objects.List to
   * control which items are listed.
   */
  public static final String PATH_DELIMITER = "/";

  /**
   * Value indicating all objects should be returned from GCS, no limit.
   */
  public static final long MAX_RESULTS_UNLIMITED = -1;

  /**
   * Creates and opens an object for writing. The bucket must already exist.
   * If the object already exists, it is deleted.
   *
   * @param resourceId identifies a StorageObject
   * @return a channel for writing to the given object
   * @throws IOException on IO error
   */
  WritableByteChannel create(StorageResourceId resourceId)
      throws IOException;

  /**
   * Creates and opens an object for writing. The bucket must already exist.
   *
   * @param resourceId identifies a StorageObject
   * @param options options to use when creating the object
   * @return a channel for writing to the given object
   * @throws IOException on IO error
   */
  WritableByteChannel create(StorageResourceId resourceId, CreateObjectOptions options)
      throws IOException;

  /**
   * Creates an empty object, useful for placeholders representing, for example, directories.
   * The bucket must already exist. If the object already exists, it is overwritten.
   *
   * @param resourceId identifies a StorageObject
   * @throws IOException on IO error
   */
  void createEmptyObject(StorageResourceId resourceId)
      throws IOException;

  /**
   * Creates an empty object, useful for placeholders representing, for example, directories.
   * The bucket must already exist. If the object already exists, it is overwritten.
   *
   * @param resourceId identifies a StorageObject
   * @param options options to use when creating the object
   * @throws IOException on IO error
   */
  void createEmptyObject(StorageResourceId resourceId, CreateObjectOptions options)
      throws IOException;

  /**
   * Creates a list of empty objects; see {@link #createEmptyObject(StorageResourceId)} for
   * the single-item version of this method. Implementations may use different flow than the
   * single-item version for greater efficiency.
   *
   * @param resourceIds identifies the StorageObjects to create
   * @throws IOException on IO error
   */
  void createEmptyObjects(List<StorageResourceId> resourceIds)
      throws IOException;

  /**
   * Creates a list of empty objects; see
   * {@link #createEmptyObject(StorageResourceId, CreateObjectOptions)} for
   * the single-item version of this method. Implementations may use different flow than the
   * single-item version for greater efficiency.
   *
   * @param resourceIds identifies the StorageObjects to create
   * @param options options to use when creating the objects
   * @throws IOException on IO error
   */
  void createEmptyObjects(List<StorageResourceId> resourceIds, CreateObjectOptions options)
      throws IOException;

  /**
   * Opens an object for reading.
   *
   * @param resourceId identifies a StorageObject
   * @return a channel for reading from the given object
   * @throws java.io.FileNotFoundException if the given object does not exist
   * @throws IOException if object exists but cannot be opened
   */
  SeekableReadableByteChannel open(StorageResourceId resourceId)
      throws IOException;

  /**
   * Creates a bucket.
   *
   * @param bucketName name of the bucket to create
   * @throws IOException on IO error
   */
  void create(String bucketName)
      throws IOException;

  /**
   * Deletes a list of buckets. Does not throw any exception for "bucket not found" errors.
   *
   * @param bucketNames name of the buckets to delete
   * @throws java.io.FileNotFoundException if the given bucket does not exist.
   *         NOTE(review): this contradicts the summary above, which says "bucket not found"
   *         errors are not reported; confirm against the implementations and drop whichever
   *         statement is wrong.
   * @throws IOException on IO error
   */
  void deleteBuckets(List<String> bucketNames)
      throws IOException;

  /**
   * Deletes the given objects. Does not throw any exception for "object not found" errors.
   *
   * @param fullObjectNames names of objects to delete with their respective bucketNames.
   * @throws java.io.FileNotFoundException if the given object does not exist.
   *         NOTE(review): this contradicts the summary above, which says "object not found"
   *         errors are not reported; confirm against the implementations and drop whichever
   *         statement is wrong.
   * @throws IOException if object exists but cannot be deleted
   */
  void deleteObjects(List<StorageResourceId> fullObjectNames)
      throws IOException;

  /**
   * Copies metadata of the given objects. After the copy is successfully complete,
   * each object blob is reachable by two different names.
   * Copying between two different locations or between two different storage classes
   * is not allowed.
   *
   * @param srcBucketName name of the bucket containing the objects to copy
   * @param srcObjectNames names of the objects to copy
   * @param dstBucketName name of the bucket to copy to
   * @param dstObjectNames names of the objects after copy
   * @throws java.io.FileNotFoundException if the source object or the destination bucket does
   *         not exist
   * @throws IOException in all other error cases
   */
  void copy(String srcBucketName, List<String> srcObjectNames,
      String dstBucketName, List<String> dstObjectNames)
      throws IOException;

  /**
   * Gets a list of names of buckets in this project.
   *
   * @return list of bucket names
   * @throws IOException on IO error
   */
  List<String> listBucketNames()
      throws IOException;

  /**
   * Gets a list of GoogleCloudStorageItemInfo for all buckets of this project. This is no more
   * expensive than calling listBucketNames(), since the list API for buckets already retrieves
   * all the relevant bucket metadata.
   *
   * @return list of bucket info
   * @throws IOException on IO error
   */
  List<GoogleCloudStorageItemInfo> listBucketInfo()
      throws IOException;

  /**
   * Gets names of objects contained in the given bucket and whose names begin with
   * the given prefix.
   *
   * <p>Note:
   * Although GCS does not implement a file system, it treats objects that contain
   * a delimiter as different from other objects when listing objects.
   * This will be clearer with an example.
   *
   * <pre>
   * Consider a bucket with objects: o1, d1/, d1/o1, d1/o2
   * With prefix == null and delimiter == /,    we get: d1/, o1
   * With prefix == null and delimiter == null, we get: o1, d1/, d1/o1, d1/o2
   * </pre>
   *
   * <p>Thus when delimiter is null, the entire key name is considered an opaque string,
   * otherwise only the part up to the first delimiter is considered.
   *
   * <p>The default implementation of this method should turn around and call
   * the version that takes {@code maxResults} so that inheriting classes
   * need only implement that version.
   *
   * @param bucketName bucket name
   * @param objectNamePrefix object name prefix or null if all objects in the bucket are desired
   * @param delimiter delimiter to use (typically "/"), otherwise null
   * @return list of object names
   * @throws IOException on IO error
   */
  List<String> listObjectNames(
      String bucketName, String objectNamePrefix, String delimiter)
      throws IOException;

  /**
   * Gets names of objects contained in the given bucket and whose names begin with
   * the given prefix.
   *
   * <p>Note:
   * Although GCS does not implement a file system, it treats objects that contain
   * a delimiter as different from other objects when listing objects.
   * This will be clearer with an example.
   *
   * <pre>
   * Consider a bucket with objects: o1, d1/, d1/o1, d1/o2
   * With prefix == null and delimiter == /,    we get: d1/, o1
   * With prefix == null and delimiter == null, we get: o1, d1/, d1/o1, d1/o2
   * </pre>
   *
   * <p>Thus when delimiter is null, the entire key name is considered an opaque string,
   * otherwise only the part up to the first delimiter is considered.
   *
   * @param bucketName bucket name
   * @param objectNamePrefix object name prefix or null if all objects in the bucket are desired
   * @param delimiter delimiter to use (typically "/"), otherwise null
   * @param maxResults maximum number of results to return,
   *        unlimited if negative or zero (see {@link #MAX_RESULTS_UNLIMITED})
   * @return list of object names
   * @throws IOException on IO error
   */
  List<String> listObjectNames(
      String bucketName, String objectNamePrefix, String delimiter,
      long maxResults)
      throws IOException;

  /**
   * Same name-matching semantics as {@link #listObjectNames(String, String, String)} except this
   * method retrieves the full GoogleCloudStorageItemInfo for each item as well.
   *
   * <p>Generally the info is already available from
   * the same "list()" calls, so the only additional cost is dispatching an extra batch request to
   * retrieve object metadata for all listed *directories*, since these are originally listed as
   * String prefixes without attached metadata.
   *
   * <p>The default implementation of this method should turn around and call
   * the version that takes {@code maxResults} so that inheriting classes
   * need only implement that version.
   *
   * @param bucketName bucket name
   * @param objectNamePrefix object name prefix or null if all objects in the bucket are desired
   * @param delimiter delimiter to use (typically "/"), otherwise null
   * @return list of object info
   * @throws IOException on IO error
   */
  List<GoogleCloudStorageItemInfo> listObjectInfo(
      String bucketName, String objectNamePrefix, String delimiter)
      throws IOException;

  /**
   * Same name-matching semantics as {@link #listObjectNames(String, String, String, long)} except
   * this method retrieves the full GoogleCloudStorageItemInfo for each item as well.
   *
   * <p>Generally the info is already available from
   * the same "list()" calls, so the only additional cost is dispatching an extra batch request to
   * retrieve object metadata for all listed *directories*, since these are originally listed as
   * String prefixes without attached metadata.
   *
   * @param bucketName bucket name
   * @param objectNamePrefix object name prefix or null if all objects in the bucket are desired
   * @param delimiter delimiter to use (typically "/"), otherwise null
   * @param maxResults maximum number of results to return,
   *        unlimited if negative or zero (see {@link #MAX_RESULTS_UNLIMITED})
   * @return list of object info
   * @throws IOException on IO error
   */
  List<GoogleCloudStorageItemInfo> listObjectInfo(
      String bucketName, String objectNamePrefix, String delimiter,
      long maxResults)
      throws IOException;

  /**
   * Gets information about an object or a bucket.
   *
   * @param resourceId identifies either root, a Bucket, or a StorageObject
   * @return information about the given item
   * @throws IOException on IO error
   */
  GoogleCloudStorageItemInfo getItemInfo(StorageResourceId resourceId)
      throws IOException;

  /**
   * Gets information about multiple objects and/or buckets. Items which are "not found" will
   * still have an entry in the returned list; exists() will return false for these entries.
   *
   * @param resourceIds names of the GCS StorageObjects or Buckets for which to retrieve info.
   * @return information about the given resourceIds.
   * @throws IOException on IO error
   */
  List<GoogleCloudStorageItemInfo> getItemInfos(List<StorageResourceId> resourceIds)
      throws IOException;

  /**
   * Attempt to update metadata of the objects referenced within the passed itemInfo objects.
   *
   * @param itemInfoList itemInfo objects referencing the objects whose metadata should be
   *        updated. NOTE(review): the element type of this list is not named anywhere in this
   *        file, so the parameter is deliberately left as a raw {@code List}; parameterize it
   *        once the intended element type is confirmed.
   * @return Updated GoogleCloudStorageItemInfo objects for the referenced objects.
   * @throws IOException on IO error
   */
  List<GoogleCloudStorageItemInfo> updateItems(List itemInfoList)
      throws IOException;

  /**
   * Releases resources used by this instance.
   */
  void close();

  /**
   * Waits for the given bucket to be empty.
   *
   * <p>Note:
   * GCS only supports eventual consistency of object lists.
   * When a user deletes a top-level directory recursively,
   * the fact that all items have gone away is not reflected instantly.
   * We retry and wait for that to happen.
   *
   * @param bucketName name of the bucket to wait on
   * @throws IOException on IO error
   */
  void waitForBucketEmpty(String bucketName)
      throws IOException;
}