org.kitesdk.data.Datasets Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of kite-data-core Show documentation
The Kite Data Core module provides simple, intuitive APIs for working with datasets in the Hadoop Platform.
There is a newer version: 1.1.0
/*
 * Copyright 2013 Cloudera Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.kitesdk.data;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import java.net.URI;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.kitesdk.data.spi.DatasetRepository;
import org.kitesdk.data.spi.AbstractDataset;
import org.kitesdk.data.spi.Constraints;
import org.kitesdk.data.spi.Pair;
import org.kitesdk.data.spi.Registration;

/**
 * Methods for working with {@link Dataset} instances.
 *
 * <h2>URIs</h2>
 * <p>
 * All methods require a URI that identifies a dataset, view, or
 * repository. The URI must begin with the scheme {@code dataset:},
 * {@code view:}, or {@code repo:}. The remainder of the URI is
 * implementation specific, depending on the dataset scheme.
 * <p>
 * For example, the URI {@code dataset:hive:movies/ratings}
 * references a dataset named <em>ratings</em> in the
 * <em>movies</em> namespace, stored in Hive.
 * <p>
 * The URI {@code view:hive:movies/ratings?year=2015&month=3}
 * references a view of the same <em>ratings</em> dataset. The view
 * is filtered to include records from only March, 2015.
 * <p>
 * See the <em>Dataset and View URIs</em> page of the Kite documentation
 * for the available URI patterns.
 *
 * <h2>Dataset Descriptors</h2>
 * <p>
 * Some methods require a {@link DatasetDescriptor} that encapsulates metadata
 * about a dataset. Descriptors are built using a
 * {@link DatasetDescriptor.Builder descriptor builder}.
 *
 * <h2>Entities</h2>
 * <p>
 * <em>Entities</em> are analogous to records in database terminology.
 * The term is used in the API to emphasize that an entity can include not
 * only primitive objects, but also complex objects such as hash maps.
 * <p>
 * Some methods accept an entity class that will be used by Kite when returning
 * entities from a dataset or view.
 *
 * @since 0.8.0
 */
public class Datasets {
  /**
   * Load a {@link Dataset} or {@link View} for the given {@link URI}.
   * 

   * URIs must begin with {@code dataset:} or {@code view:}. The remainder of
   * the URI is implementation specific, depending on the dataset scheme.
   * 

   * If you use a dataset URI, {@code load} returns the unfiltered dataset.
   * If you use a view URI, {@code load} returns a {@code View} configured to
   * read a subset of the dataset.
   *
   * @param uri a {@code Dataset} or {@code View} URI
   * @param type a Java class that represents an entity in the dataset
   * @param  the type used for readers and writers created by this
   *          {@code Dataset}
   * @param  the type of {@code View} expected
   * @return a {@code View} for the given URI
   * @throws DatasetNotFoundException if there is no dataset for the given URI
   * @throws NullPointerException if any arguments are {@code null}
   * @throws IllegalArgumentException
   *          if {@code uri} is not a dataset or view URI
   */
  @SuppressWarnings("unchecked")
  public static > V load(URI uri, Class type) {
    boolean isView = URIBuilder.VIEW_SCHEME.equals(uri.getScheme());
    Preconditions.checkArgument(isView ||
        URIBuilder.DATASET_SCHEME.equals(uri.getScheme()),
        "Not a dataset or view URI: " + uri);
    Preconditions.checkNotNull(type,
        "The entity type can't be null, use Object.class to have the type"
        + " determined by the schema.");

    Pair> pair =
        Registration.lookupDatasetUri(URI.create(uri.getRawSchemeSpecificPart()));
    DatasetRepository repo = pair.first();
    Map uriOptions = pair.second();

    Dataset dataset = repo.load(
        uriOptions.get(URIBuilder.NAMESPACE_OPTION),
        uriOptions.get(URIBuilder.DATASET_NAME_OPTION), type);

    if (isView) {
      return Datasets. view(dataset, uriOptions);
    } else {
      // if the URI isn't a view URI, only load the dataset
      return (V) dataset;
    }
  }

  /**
   * Load a {@link Dataset} or {@link View} for the given {@link URI}.
   * 

   * URIs must begin with {@code dataset:} or {@code view:}. The remainder of
   * the URI is implementation specific, depending on the dataset scheme.
   * 

   * If you use a dataset URI, {@code load} returns the unfiltered dataset.
   * If you use a view URI, {@code load} returns a {@code View} configured to
   * read  a subset of the dataset.
   * 
   * @param uri a {@code Dataset} or {@code View} URI
   * @param  the type of {@code View} expected
   * @return a {@code View} for the given URI
   * @throws DatasetNotFoundException if there is no dataset for the given URI
   * @throws NullPointerException if any arguments are {@code null}
   * @throws IllegalArgumentException
   *          if {@code uri} is not a dataset or view URI
   */
  @SuppressWarnings("unchecked")
  public static > V load(URI uri) {
    return Datasets.load(uri, GenericRecord.class);
  }

  /**
   * Load a {@link Dataset} or {@link View} for the given {@link URI}.
   * 

   * URIs must begin with {@code dataset:} or {@code view:}. The remainder of
   * the URI is implementation specific, depending on the dataset scheme.
   * 

   * If you use a dataset URI, {@code load} returns the unfiltered dataset.
   * If you use a view URI, {@code load} returns a {@code View} configured to 
   * read a subset of the dataset.
   *
   * @param uriString a {@code Dataset} or {@code View} URI
   * @param type a Java class that represents an entity in the dataset
   * @param  the type used for readers and writers created by this
   *          {@code Dataset}
   * @param  the type of {@code View} expected
   * @return a {@code View} for the given URI
   * @throws DatasetNotFoundException if there is no dataset for the given URI
   * @throws NullPointerException if any arguments are {@code null}
   * @throws IllegalArgumentException
   *          if {@code uri} is not a dataset or view URI
   */
  public static > V load(String uriString, Class type) {
    return Datasets. load(URI.create(uriString), type);
  }

  /**
   * Load a {@link Dataset} or {@link View} for the given {@link URI}.
   * 

   * URIs must begin with {@code dataset:} or {@code view:}. The remainder of
   * the URI is implementation specific, depending on the dataset scheme.
   * 

   * If you use a dataset URI, {@code load} returns the unfiltered dataset.
   * If you use a view URI, {@code load} returns a {@code View} configured to
   * read a subset of the dataset.
   *
   * @param uriString a {@code Dataset} or {@code View} URI
   * @param  the type of {@code View} expected
   * @return a {@code View} for the given URI
   * @throws DatasetNotFoundException if there is no dataset for the given URI
   * @throws NullPointerException if any arguments are {@code null}
   * @throws IllegalArgumentException
   *          if {@code uri} is not a dataset or view URI
   */
  public static > V load(String uriString) {
    return Datasets.load(
        uriString, GenericRecord.class);
  }

  /**
   * Create a {@link Dataset} for the given dataset or view URI.
   * {@code create} returns an empty dataset. You can use {@code DatasetWriter}
   * to populate your dataset.
   * 

   * URIs must begin with {@code dataset:} or {@code view:}. The remainder of
   * the URI is implementation specific, depending on the dataset scheme. If the
   * URI is a view URI, this method creates the underlying dataset and returns a
   * view of it.
   *
   * @param uri a {@code Dataset} or {@code View} URI
   * @param type a Java class that represents an entity in the dataset
   * @param  the type used for readers and writers created by this
   *          {@code Dataset}
   * @param  the type of {@code Dataset} or {@code View} expected
   * @return a newly created {@code Dataset} responsible for the given URI
   * @throws NullPointerException
   *          if {@code uri}, {@code descriptor}, or {@code type} is
   *          {@code null}
   * @throws IllegalArgumentException
   *          if {@code uri} is not a dataset or view URI
   * @throws DatasetExistsException
   *          if a {@code Dataset} for the given URI already exists
   * @throws IncompatibleSchemaException
   *          if the schema is not compatible with existing datasets with
   *          shared storage (for example, in the same HBase table)
   */
  @SuppressWarnings("unchecked")
  public static > V create(URI uri, DatasetDescriptor descriptor, Class type) {
    boolean isView = URIBuilder.VIEW_SCHEME.equals(uri.getScheme());
    Preconditions.checkArgument(isView ||
        URIBuilder.DATASET_SCHEME.equals(uri.getScheme()),
        "Not a dataset or view URI: " + uri);
    Preconditions.checkNotNull(type,
        "The entity type can't be null, use Object.class to have the type"
        + " determined by the schema.");

    Pair> pair =
        Registration.lookupDatasetUri(URI.create(uri.getRawSchemeSpecificPart()));
    DatasetRepository repo = pair.first();
    Map uriOptions = pair.second();

    if (descriptor.getLocation() == null && uriOptions.containsKey("location")) {
      descriptor = new DatasetDescriptor.Builder(descriptor)
          .location(uriOptions.get("location"))
          .build();
    }

    Dataset dataset = repo.create(
        uriOptions.get(URIBuilder.NAMESPACE_OPTION),
        uriOptions.get(URIBuilder.DATASET_NAME_OPTION), descriptor, type);

    if (isView) {
      return Datasets. view(dataset, uriOptions);
    } else {
      return (V) dataset;
    }
  }

  /**
   * Create a {@link Dataset} for the given dataset or view URI.
   * {@code create} returns an empty dataset. You can use {@code DatasetWriter}
   * to populate your dataset.
   * 

   * URIs must begin with {@code dataset:} or {@code view:}. The remainder of
   * the URI is implementation specific, depending on the dataset scheme. If the
   * URI is a view URI, this method creates the underlying dataset and returns a
   * view of it.
   *
   * @param uri a {@code Dataset} or {@code View} URI
   * @param  the type of {@code Dataset} or {@code View} expected
   * @return a newly created {@code Dataset} responsible for the given URI
   * @throws NullPointerException
   *          if {@code uri} or {@code descriptor} is {@code null}
   * @throws IllegalArgumentException
   *          if {@code uri} is not a dataset or view URI
   * @throws DatasetExistsException
   *          if a {@code Dataset} for the given URI already exists
   * @throws IncompatibleSchemaException
   *          if the schema is not compatible with existing datasets with
   *          shared storage (for example, in the same HBase table)
   */
  @SuppressWarnings("unchecked")
  public static > V create(URI uri, DatasetDescriptor descriptor) {
    return Datasets.create(
        uri, descriptor, GenericRecord.class);
  }

  /**
   * Create a {@link Dataset} for the given dataset or view URI string.
   * {@code create} returns an empty dataset. You can use {@code DatasetWriter}
   * to populate your dataset.
   * 

   * URIs must begin with {@code dataset:} or {@code view:}. The remainder of
   * the URI is implementation specific, depending on the dataset scheme. If the
   * URI is a view URI, this method creates the underlying dataset and returns a
   * view of it.
   *
   * @param uri a {@code Dataset} or {@code View} URI string
   * @param type a Java class that represents an entity in the dataset
   * @param  the type used for readers and writers created by this
   *          {@code Dataset}
   * @param  the type of {@code Dataset} or {@code View} expected
   * @return a newly created {@code Dataset} responsible for the given URI
   * @throws NullPointerException
   *          if {@code uri}, {@code descriptor}, or {@code type} is
   *          {@code null}
   * @throws IllegalArgumentException
   *          if {@code uri} is not a dataset or view URI
   * @throws DatasetExistsException
   *          if a {@code Dataset} for the given URI already exists
   * @throws IncompatibleSchemaException
   *          if the schema is not compatible with existing datasets with
   *          shared storage (for example, in the same HBase table)
   */
  public static > V create(String uri, DatasetDescriptor descriptor, Class type) {
    return Datasets. create(URI.create(uri), descriptor, type);
  }

  /**
   * Create a {@link Dataset} for the given dataset or view URI string.
   * {@code create} returns an empty dataset. You can use {@code DatasetWriter}
   * to populate your dataset.
   * 

   * URIs must begin with {@code dataset:} or {@code view:}. The remainder of
   * the URI is implementation specific, depending on the dataset scheme. If the
   * URI is a view URI, this method creates the underlying dataset and returns a
   * view of it.
   *
   * @param uri a {@code Dataset} or {@code View} URI string
   * @param  the type of {@code Dataset} or {@code View} expected
   * @return a newly created {@code Dataset} responsible for the given URI
   * @throws NullPointerException
   *          if {@code uri} or {@code descriptor} is {@code null}
   * @throws IllegalArgumentException
   *          if {@code uri} is not a dataset or view URI
   * @throws DatasetExistsException
   *          if a {@code Dataset} for the given URI already exists
   * @throws IncompatibleSchemaException
   *          if the schema is not compatible with existing datasets with
   *          shared storage (for example, in the same HBase table)
   */
  @SuppressWarnings("unchecked")
  public static > V create(String uri, DatasetDescriptor descriptor) {
    return Datasets.create(
        uri, descriptor, GenericRecord.class);
  }

  /**
   * Update a {@link Dataset} for the given dataset or view URI.
   * 

   * You can add columns, remove columns, or change the data type of columns 
   * in your dataset, provided you don't attempt a change that is incompatible
   * with written data. Avro defines rules for compatible schema evolution. See 
   * Schema
   * Evolution.
   * 

   * This method updates the dataset descriptor, so you can also add
   * or change properties.
   * 

   * The recommended way to update a dataset descriptor is to build it
   * based on an existing descriptor. Use
   * {@link DatasetDescriptor.Builder(DatasetDescriptor)} to
   * build a DatasetDescriptor based on an existing instance.
   * 

   * You cannot change a dataset format or partition strategy.
   * 

   * URIs must begin with {@code dataset:}. The remainder of
   * the URI is implementation specific, depending on the dataset scheme.
   *
   * @param uri a {@code Dataset} URI
   * @param type a Java class that represents an entity in the dataset
   * @param  the type used for readers and writers created by this
   *          {@code Dataset}
   * @param  the type of {@code Dataset} expected
   * @return a {@code Dataset} for the given URI
   * @throws NullPointerException
   *          if {@code uri}, {@code descriptor}, or {@code type} is
   *          {@code null}
   * @throws IllegalArgumentException if {@code uri} is not a dataset URI
   * @throws DatasetNotFoundException
   *          if there is no dataset for the given URI
   * @throws UnsupportedOperationException
   *          if descriptor updates are not supported by the implementation
   * @throws ConcurrentSchemaModificationException
   *          if the {@code Dataset} schema is updated concurrently
   * @throws IncompatibleSchemaException
   *          if the schema is not compatible with previous schemas, or with
   *          existing datasets with shared storage (for example, in the same
   *          HBase table)
   */
  @SuppressWarnings("unchecked")
  public static > D update(
      URI uri, DatasetDescriptor descriptor, Class type) {
    Preconditions.checkArgument(
        URIBuilder.DATASET_SCHEME.equals(uri.getScheme()),
        "Not a dataset or view URI: " + uri);
    Preconditions.checkNotNull(type,
        "The entity type can't be null, use Object.class to have the type"
            + " determined by the schema.");

    Pair> pair =
        Registration.lookupDatasetUri(URI.create(uri.getRawSchemeSpecificPart()));
    DatasetRepository repo = pair.first();
    Map uriOptions = pair.second();

    return (D) repo.update(
        uriOptions.get(URIBuilder.NAMESPACE_OPTION),
        uriOptions.get(URIBuilder.DATASET_NAME_OPTION), descriptor, type);
  }

  /**
   * Update a {@link Dataset} for the given dataset or view URI.
   * 

   * You can add columns, remove columns, or change the data type of columns 
   * in your dataset, provided you don't attempt a change that is incompatible
   * with written data. Avro defines rules for compatible schema evolution. See 
   * Schema
   * Evolution.
   * 

   * This method updates the dataset descriptor, so you can also add
   * or change properties.
   * 

   * The recommended way to update a dataset descriptor is to build it
   * based on an existing descriptor. Use
   * {@link DatasetDescriptor.Builder(DatasetDescriptor)} to
   * build a DatasetDescriptor based on an existing instance.
   * 

   * You cannot change a dataset format or partition strategy.
   * 

   * URIs must begin with {@code dataset:}. The remainder of
   * the URI is implementation specific, depending on the dataset scheme.
   *
   * @param uri a {@code Dataset} URI
   * @param  the type of {@code Dataset} expected
   * @return a {@code Dataset} for the given URI
   * @throws NullPointerException
   *          if {@code uri} or {@code descriptor} is {@code null}
   * @throws IllegalArgumentException if {@code uri} is not a dataset URI
   * @throws DatasetNotFoundException
   *          if there is no dataset for the given URI
   * @throws UnsupportedOperationException
   *          if descriptor updates are not supported by the implementation
   * @throws ConcurrentSchemaModificationException
   *          if the {@code Dataset} schema is updated concurrently
   * @throws IncompatibleSchemaException
   *          if the schema is not compatible with previous schemas, or with
   *          existing datasets with shared storage (for example, in the same
   *          HBase table)
   */
  @SuppressWarnings("unchecked")
  public static > D update(
      URI uri, DatasetDescriptor descriptor) {
    return Datasets.update(
        uri, descriptor, GenericRecord.class);
  }

  /**
   * Update a {@link Dataset} for the given dataset or view URI string.
   * 

   * You can add columns, remove columns, or change the data type of columns 
   * in your dataset, provided you don't attempt a change that is incompatible
   * with written data. Avro defines rules for compatible schema evolution. See 
   * Schema
   * Evolution.
   * 

   * This method updates the dataset descriptor, so you can also add
   * or change properties.
   * 

   * The recommended way to update a dataset descriptor is to build it
   * based on an existing descriptor. Use
   * {@link DatasetDescriptor.Builder(DatasetDescriptor)} to
   * build a DatasetDescriptor based on an existing instance.
   * 

   * You cannot change a dataset format or partition strategy.
   * 

   * URIs must begin with {@code dataset:}. The remainder of
   * the URI is implementation specific, depending on the dataset scheme.
   *
   * @param uri a {@code Dataset} URI string
   * @param type a Java class that represents an entity in the dataset
   * @param  the type used for readers and writers created by this
   *          {@code Dataset}
   * @param  the type of {@code Dataset} expected
   * @return a {@code Dataset} for the given URI
   * @throws NullPointerException
   *          if {@code uri}, {@code descriptor}, or {@code type} is
   *          {@code null}
   * @throws IllegalArgumentException if {@code uri} is not a dataset URI
   * @throws DatasetNotFoundException
   *          if there is no dataset for the given URI
   * @throws UnsupportedOperationException
   *          if descriptor updates are not supported by the implementation
   * @throws ConcurrentSchemaModificationException
   *          if the {@code Dataset} schema is updated concurrently
   * @throws IncompatibleSchemaException
   *          if the schema is not compatible with previous schemas, or with
   *          existing datasets with shared storage (for example, in the same
   *          HBase table)
   */
  public static > D update(String uri, DatasetDescriptor descriptor, Class type) {
    return Datasets. update(URI.create(uri), descriptor, type);
  }

  /**
   * Update a {@link Dataset} for the given dataset or view URI string.
   * 

   * You can add columns, remove columns, or change the data type of columns 
   * in your dataset, provided you don't attempt a change that is incompatible
   * with written data. Avro defines rules for compatible schema evolution. See 
   * Schema
   * Evolution.
   * 

   * This method updates the dataset descriptor, so you can also add
   * or change properties.
   * 

   * The recommended way to update a dataset descriptor is to build it
   * based on an existing descriptor. Use
   * {@link DatasetDescriptor.Builder(DatasetDescriptor)} to
   * build a DatasetDescriptor based on an existing instance.
   * 

   * You cannot change a dataset format or partition strategy.
   * 

   * URIs must begin with {@code dataset:}. The remainder of
   * the URI is implementation specific, depending on the dataset scheme.
   *
   * @param uri a {@code Dataset} URI string
   * @param  the type of {@code Dataset} expected
   * @return a {@code Dataset} for the given URI
   * @throws NullPointerException
   *          if {@code uri} or {@code descriptor} is {@code null}
   * @throws IllegalArgumentException if {@code uri} is not a dataset URI
   * @throws DatasetNotFoundException
   *          if there is no dataset for the given URI
   * @throws UnsupportedOperationException
   *          if descriptor updates are not supported by the implementation
   * @throws ConcurrentSchemaModificationException
   *          if the {@code Dataset} schema is updated concurrently
   * @throws IncompatibleSchemaException
   *          if the schema is not compatible with previous schemas, or with
   *          existing datasets with shared storage (for example, in the same
   *          HBase table)
   */
  public static > D update(String uri, DatasetDescriptor descriptor) {
    return Datasets.update(
        uri, descriptor, GenericRecord.class);
  }

  /**
   * Delete a {@link Dataset} identified by the given dataset URI.
   * <p>
   * When you call this method using a dataset URI, both data and metadata are
   * deleted. After you call this method, the dataset no longer exists, unless
   * an exception is thrown.
   * <p>
   * View URIs are not accepted by this method: any URI whose scheme is not
   * {@code dataset:} is rejected with an {@code IllegalArgumentException}
   * before the repository is contacted.
   * <p>
   * URIs must begin with {@code dataset:}. The remainder of
   * the URI is implementation specific, depending on the dataset scheme.
   *
   * @param uri a {@code Dataset} URI
   * @return {@code true} if any data or metadata is removed, {@code false}
   *          otherwise
   * @throws NullPointerException if {@code uri} is null
   * @throws IllegalArgumentException if {@code uri} is not a dataset URI
   */
  public static boolean delete(URI uri) {
    Preconditions.checkArgument(
        URIBuilder.DATASET_SCHEME.equals(uri.getScheme()),
        "Not a dataset URI: %s", uri);

    // the leading "dataset:" scheme is stripped before the lookup
    Pair<DatasetRepository, Map<String, String>> pair =
        Registration.lookupDatasetUri(URI.create(uri.getRawSchemeSpecificPart()));
    DatasetRepository repo = pair.first();
    Map<String, String> uriOptions = pair.second();

    return repo.delete(
        uriOptions.get(URIBuilder.NAMESPACE_OPTION),
        uriOptions.get(URIBuilder.DATASET_NAME_OPTION));
  }

  /**
   * Delete a {@link Dataset} identified by the given dataset URI string.
   * <p>
   * The string is parsed with {@link URI#create(String)} and handed to
   * {@link #delete(URI)}. When the URI is a dataset URI, both data and
   * metadata are deleted; after this call the dataset no longer exists,
   * unless an exception is thrown.
   * <p>
   * URIs must begin with {@code dataset:}; {@link #delete(URI)} rejects any
   * other scheme, including {@code view:}. The remainder of the URI is
   * implementation specific, depending on the dataset scheme.
   *
   * @param uri a {@code Dataset} URI string
   * @return {@code true} if any data or metadata is removed, {@code false}
   *          otherwise
   * @throws NullPointerException if {@code uri} is null
   * @throws IllegalArgumentException
   *          if {@code uri} is not a valid URI, or is not a dataset URI
   */
  public static boolean delete(String uri) {
    final URI datasetUri = URI.create(uri);
    return delete(datasetUri);
  }

  /**
   * Check whether a {@link Dataset} identified by the given URI exists.
   * <p>
   * URIs must begin with {@code dataset:}. The remainder of
   * the URI is implementation specific, depending on the dataset scheme.
   *
   * @param uri a {@code Dataset} URI
   * @return {@code true} if the dataset exists, {@code false} otherwise
   * @throws NullPointerException if {@code uri} is null
   * @throws IllegalArgumentException if {@code uri} is not a dataset URI
   */
  public static boolean exists(URI uri) {
    Preconditions.checkArgument(
        URIBuilder.DATASET_SCHEME.equals(uri.getScheme()),
        "Not a dataset URI: %s", uri);

    // the leading "dataset:" scheme is stripped before the lookup
    Pair<DatasetRepository, Map<String, String>> pair =
        Registration.lookupDatasetUri(URI.create(uri.getRawSchemeSpecificPart()));
    DatasetRepository repo = pair.first();
    Map<String, String> uriOptions = pair.second();

    return repo.exists(
        uriOptions.get(URIBuilder.NAMESPACE_OPTION),
        uriOptions.get(URIBuilder.DATASET_NAME_OPTION));
  }

  /**
   * Check whether a {@link Dataset} identified by the given URI string exists.
   * <p>
   * The string is parsed with {@link URI#create(String)} and handed to
   * {@link #exists(URI)}.
   * <p>
   * URIs must begin with {@code dataset:}. The remainder of
   * the URI is implementation specific, depending on the dataset scheme.
   *
   * @param uri a {@code Dataset} URI string
   * @return {@code true} if the dataset exists, {@code false} otherwise
   * @throws NullPointerException if {@code uri} is null
   * @throws IllegalArgumentException
   *          if {@code uri} is not a valid URI, or is not a dataset URI
   */
  public static boolean exists(String uri) {
    final URI datasetUri = URI.create(uri);
    return exists(datasetUri);
  }

  /**
   * List the {@link Dataset} URIs in the repository identified by the URI.
   * <p>
   * URI formats are defined by {@code Dataset} implementations. The repository
   * URIs you pass to this method must begin with {@code repo:}. For example, to
   * list the {@code Dataset} URIs for the Hive repository, provide the URI
   * {@code repo:hive}.
   *
   * @param uri a {@code DatasetRepository} URI
   * @return the URIs present in the {@code DatasetRepository}, one per
   *          dataset in each namespace the repository reports
   * @throws NullPointerException if {@code uri} is null
   * @throws IllegalArgumentException if {@code uri} is not a repository URI
   */
  public static Collection<URI> list(URI uri) {
    boolean isRepo = URIBuilder.REPO_SCHEME.equals(uri.getScheme());
    Preconditions.checkArgument(isRepo, "Not a repository URI: %s", uri);
    // the leading "repo:" scheme is stripped before the repository is opened
    DatasetRepository repo = Registration
        .open(URI.create(uri.getRawSchemeSpecificPart()));

    // build a URI for each dataset name
    URI repoUri = repo.getUri();
    List<URI> datasets = Lists.newArrayList();
    for (String namespace : repo.namespaces()) {
      for (String dataset : repo.datasets(namespace)) {
        datasets.add(new URIBuilder(repoUri, namespace, dataset).build());
      }
    }

    return datasets;
  }

  /**
   * List the {@link Dataset} URIs in the repository identified by the URI
   * string.
   * <p>
   * URI formats are defined by {@code Dataset} implementations. The repository
   * URIs you pass to this method must begin with {@code repo:}. For example, to
   * list the {@code Dataset} URIs for the Hive repository, provide the URI
   * {@code repo:hive}.
   *
   * @param uri a {@code DatasetRepository} URI string
   * @return the URIs present in the {@code DatasetRepository}
   * @throws NullPointerException if {@code uri} is null
   * @throws IllegalArgumentException
   *          if {@code uri} is not a valid URI, or is not a repository URI
   */
  public static Collection<URI> list(String uri) {
    return list(URI.create(uri));
  }

  @SuppressWarnings("unchecked")
  private static > V view(Dataset dataset,
                                               Map uriOptions) {
    if (dataset instanceof AbstractDataset) {
      DatasetDescriptor descriptor = dataset.getDescriptor();
      Schema schema = descriptor.getSchema();
      PartitionStrategy strategy = null;
      if (descriptor.isPartitioned()) {
        strategy = descriptor.getPartitionStrategy();
      }
      Constraints constraints = Constraints.fromQueryMap(
          schema, strategy, uriOptions);
      return (V) ((AbstractDataset) dataset).filter(constraints);
    } else {
      return (V) dataset;
    }
  }
}