All downloads are free. Search and download functionality uses the official Maven repository.

org.kitesdk.data.Datasets Maven / Gradle / Ivy

Go to download

The Kite Data Core module provides simple, intuitive APIs for working with datasets in the Hadoop Platform.

There is a newer version: 1.1.0
Show newest version
/*
 * Copyright 2013 Cloudera Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.kitesdk.data;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import java.net.URI;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.kitesdk.data.spi.DatasetRepository;
import org.kitesdk.data.spi.AbstractDataset;
import org.kitesdk.data.spi.Constraints;
import org.kitesdk.data.spi.Pair;
import org.kitesdk.data.spi.Registration;

/**
 * Methods for working with {@link Dataset} instances.
 * 

* URIs *

* All methods require a URI that identifies a dataset, view, or * repository. The URI must begin with the scheme {@code dataset:}, * {@code view:}, or {@code repo:}. The remainder of the URI is * implementation specific, depending on the dataset scheme. *

* For example, the URI {@code dataset:hive:movies/ratings} * references a dataset named ratings in the * movies namespace, stored in Hive. *

* The URI {@code view:hive:movies/ratings?year=2015&month=3} * references a view of the same ratings dataset. The view * is filtered to include records from only March, 2015. *

* See Dataset and View * URIs for the available URI patterns. *

* Dataset Descriptors *

* Some methods require a {@link DatasetDescriptor} that encapsulates metadata * about a dataset. Descriptors are built using a * {@link DatasetDescriptor.Builder descriptor builder}. *

* Entities *

* Entities are analagous to records in database terminology. * The term is used in the API to emphasize that an entity can include not * only primitive objects, but also complex objects such as hash maps. *

* Some methods accept an entity class that will be used by Kite when returning * entities from a dataset or view. * * @since 0.8.0 */ public class Datasets { /** * Load a {@link Dataset} or {@link View} for the given {@link URI}. *

* URIs must begin with {@code dataset:} or {@code view:}. The remainder of * the URI is implementation specific, depending on the dataset scheme. *

* If you use a dataset URI, {@code load} returns the unfiltered dataset. * If you use a view URI, {@code load} returns a {@code View} configured to * read a subset of the dataset. * * @param uri a {@code Dataset} or {@code View} URI * @param type a Java class that represents an entity in the dataset * @param the type used for readers and writers created by this * {@code Dataset} * @param the type of {@code View} expected * @return a {@code View} for the given URI * @throws DatasetNotFoundException if there is no dataset for the given URI * @throws NullPointerException if any arguments are {@code null} * @throws IllegalArgumentException * if {@code uri} is not a dataset or view URI */ @SuppressWarnings("unchecked") public static > V load(URI uri, Class type) { boolean isView = URIBuilder.VIEW_SCHEME.equals(uri.getScheme()); Preconditions.checkArgument(isView || URIBuilder.DATASET_SCHEME.equals(uri.getScheme()), "Not a dataset or view URI: " + uri); Preconditions.checkNotNull(type, "The entity type can't be null, use Object.class to have the type" + " determined by the schema."); Pair> pair = Registration.lookupDatasetUri(URI.create(uri.getRawSchemeSpecificPart())); DatasetRepository repo = pair.first(); Map uriOptions = pair.second(); Dataset dataset = repo.load( uriOptions.get(URIBuilder.NAMESPACE_OPTION), uriOptions.get(URIBuilder.DATASET_NAME_OPTION), type); if (isView) { return Datasets. view(dataset, uriOptions); } else { // if the URI isn't a view URI, only load the dataset return (V) dataset; } } /** * Load a {@link Dataset} or {@link View} for the given {@link URI}. *

* URIs must begin with {@code dataset:} or {@code view:}. The remainder of * the URI is implementation specific, depending on the dataset scheme. *

* If you use a dataset URI, {@code load} returns the unfiltered dataset. * If you use a view URI, {@code load} returns a {@code View} configured to * read a subset of the dataset. * * @param uri a {@code Dataset} or {@code View} URI * @param the type of {@code View} expected * @return a {@code View} for the given URI * @throws DatasetNotFoundException if there is no dataset for the given URI * @throws NullPointerException if any arguments are {@code null} * @throws IllegalArgumentException * if {@code uri} is not a dataset or view URI */ @SuppressWarnings("unchecked") public static > V load(URI uri) { return Datasets.load(uri, GenericRecord.class); } /** * Load a {@link Dataset} or {@link View} for the given {@link URI}. *

* URIs must begin with {@code dataset:} or {@code view:}. The remainder of * the URI is implementation specific, depending on the dataset scheme. *

* If you use a dataset URI, {@code load} returns the unfiltered dataset. * If you use a view URI, {@code load} returns a {@code View} configured to * read a subset of the dataset. * * @param uriString a {@code Dataset} or {@code View} URI * @param type a Java class that represents an entity in the dataset * @param the type used for readers and writers created by this * {@code Dataset} * @param the type of {@code View} expected * @return a {@code View} for the given URI * @throws DatasetNotFoundException if there is no dataset for the given URI * @throws NullPointerException if any arguments are {@code null} * @throws IllegalArgumentException * if {@code uri} is not a dataset or view URI */ public static > V load(String uriString, Class type) { return Datasets. load(URI.create(uriString), type); } /** * Load a {@link Dataset} or {@link View} for the given {@link URI}. *

* URIs must begin with {@code dataset:} or {@code view:}. The remainder of * the URI is implementation specific, depending on the dataset scheme. *

* If you use a dataset URI, {@code load} returns the unfiltered dataset. * If you use a view URI, {@code load} returns a {@code View} configured to * read a subset of the dataset. * * @param uriString a {@code Dataset} or {@code View} URI * @param the type of {@code View} expected * @return a {@code View} for the given URI * @throws DatasetNotFoundException if there is no dataset for the given URI * @throws NullPointerException if any arguments are {@code null} * @throws IllegalArgumentException * if {@code uri} is not a dataset or view URI */ public static > V load(String uriString) { return Datasets.load( uriString, GenericRecord.class); } /** * Create a {@link Dataset} for the given dataset or view URI. * {@code create} returns an empty dataset. You can use {@code DatasetWriter} * to populate your dataset. *

* URIs must begin with {@code dataset:} or {@code view:}. The remainder of * the URI is implementation specific, depending on the dataset scheme. If the * URI is a view URI, this method creates the underlying dataset and returns a * view of it. * * @param uri a {@code Dataset} or {@code View} URI * @param type a Java class that represents an entity in the dataset * @param the type used for readers and writers created by this * {@code Dataset} * @param the type of {@code Dataset} or {@code View} expected * @return a newly created {@code Dataset} responsible for the given URI * @throws NullPointerException * if {@code uri}, {@code descriptor}, or {@code type} is * {@code null} * @throws IllegalArgumentException * if {@code uri} is not a dataset or view URI * @throws DatasetExistsException * if a {@code Dataset} for the given URI already exists * @throws IncompatibleSchemaException * if the schema is not compatible with existing datasets with * shared storage (for example, in the same HBase table) */ @SuppressWarnings("unchecked") public static > V create(URI uri, DatasetDescriptor descriptor, Class type) { boolean isView = URIBuilder.VIEW_SCHEME.equals(uri.getScheme()); Preconditions.checkArgument(isView || URIBuilder.DATASET_SCHEME.equals(uri.getScheme()), "Not a dataset or view URI: " + uri); Preconditions.checkNotNull(type, "The entity type can't be null, use Object.class to have the type" + " determined by the schema."); Pair> pair = Registration.lookupDatasetUri(URI.create(uri.getRawSchemeSpecificPart())); DatasetRepository repo = pair.first(); Map uriOptions = pair.second(); if (descriptor.getLocation() == null && uriOptions.containsKey("location")) { descriptor = new DatasetDescriptor.Builder(descriptor) .location(uriOptions.get("location")) .build(); } Dataset dataset = repo.create( uriOptions.get(URIBuilder.NAMESPACE_OPTION), uriOptions.get(URIBuilder.DATASET_NAME_OPTION), descriptor, type); if (isView) { return Datasets. 
view(dataset, uriOptions); } else { return (V) dataset; } } /** * Create a {@link Dataset} for the given dataset or view URI. * {@code create} returns an empty dataset. You can use {@code DatasetWriter} * to populate your dataset. *

* URIs must begin with {@code dataset:} or {@code view:}. The remainder of * the URI is implementation specific, depending on the dataset scheme. If the * URI is a view URI, this method creates the underlying dataset and returns a * view of it. * * @param uri a {@code Dataset} or {@code View} URI * @param the type of {@code Dataset} or {@code View} expected * @return a newly created {@code Dataset} responsible for the given URI * @throws NullPointerException * if {@code uri} or {@code descriptor} is {@code null} * @throws IllegalArgumentException * if {@code uri} is not a dataset or view URI * @throws DatasetExistsException * if a {@code Dataset} for the given URI already exists * @throws IncompatibleSchemaException * if the schema is not compatible with existing datasets with * shared storage (for example, in the same HBase table) */ @SuppressWarnings("unchecked") public static > V create(URI uri, DatasetDescriptor descriptor) { return Datasets.create( uri, descriptor, GenericRecord.class); } /** * Create a {@link Dataset} for the given dataset or view URI string. * {@code create} returns an empty dataset. You can use {@code DatasetWriter} * to populate your dataset. *

* URIs must begin with {@code dataset:} or {@code view:}. The remainder of * the URI is implementation specific, depending on the dataset scheme. If the * URI is a view URI, this method creates the underlying dataset and returns a * view of it. * * @param uri a {@code Dataset} or {@code View} URI string * @param type a Java class that represents an entity in the dataset * @param the type used for readers and writers created by this * {@code Dataset} * @param the type of {@code Dataset} or {@code View} expected * @return a newly created {@code Dataset} responsible for the given URI * @throws NullPointerException * if {@code uri}, {@code descriptor}, or {@code type} is * {@code null} * @throws IllegalArgumentException * if {@code uri} is not a dataset or view URI * @throws DatasetExistsException * if a {@code Dataset} for the given URI already exists * @throws IncompatibleSchemaException * if the schema is not compatible with existing datasets with * shared storage (for example, in the same HBase table) */ public static > V create(String uri, DatasetDescriptor descriptor, Class type) { return Datasets. create(URI.create(uri), descriptor, type); } /** * Create a {@link Dataset} for the given dataset or view URI string. * {@code create} returns an empty dataset. You can use {@code DatasetWriter} * to populate your dataset. *

* URIs must begin with {@code dataset:} or {@code view:}. The remainder of * the URI is implementation specific, depending on the dataset scheme. If the * URI is a view URI, this method creates the underlying dataset and returns a * view of it. * * @param uri a {@code Dataset} or {@code View} URI string * @param the type of {@code Dataset} or {@code View} expected * @return a newly created {@code Dataset} responsible for the given URI * @throws NullPointerException * if {@code uri} or {@code descriptor} is {@code null} * @throws IllegalArgumentException * if {@code uri} is not a dataset or view URI * @throws DatasetExistsException * if a {@code Dataset} for the given URI already exists * @throws IncompatibleSchemaException * if the schema is not compatible with existing datasets with * shared storage (for example, in the same HBase table) */ @SuppressWarnings("unchecked") public static > V create(String uri, DatasetDescriptor descriptor) { return Datasets.create( uri, descriptor, GenericRecord.class); } /** * Update a {@link Dataset} for the given dataset or view URI. *

* You can add columns, remove columns, or change the data type of columns * in your dataset, provided you don't attempt a change that is incompatible * with written data. Avro defines rules for compatible schema evolution. See * Schema * Evolution. *

* This method updates the dataset descriptor, so you can also add * or change properties. *

* The recommended way to update a dataset descriptor is to build it * based on an existing descriptor. Use * {@link DatasetDescriptor.Builder(DatasetDescriptor)} to * build a DatasetDescriptor based on an existing instance. *

* You cannot change a dataset format or partition strategy. *

* URIs must begin with {@code dataset:}. The remainder of * the URI is implementation specific, depending on the dataset scheme. * * @param uri a {@code Dataset} URI * @param type a Java class that represents an entity in the dataset * @param the type used for readers and writers created by this * {@code Dataset} * @param the type of {@code Dataset} expected * @return a {@code Dataset} for the given URI * @throws NullPointerException * if {@code uri}, {@code descriptor}, or {@code type} is * {@code null} * @throws IllegalArgumentException if {@code uri} is not a dataset URI * @throws DatasetNotFoundException * if there is no dataset for the given URI * @throws UnsupportedOperationException * if descriptor updates are not supported by the implementation * @throws ConcurrentSchemaModificationException * if the {@code Dataset} schema is updated concurrently * @throws IncompatibleSchemaException * if the schema is not compatible with previous schemas, or with * existing datasets with shared storage (for example, in the same * HBase table) */ @SuppressWarnings("unchecked") public static > D update( URI uri, DatasetDescriptor descriptor, Class type) { Preconditions.checkArgument( URIBuilder.DATASET_SCHEME.equals(uri.getScheme()), "Not a dataset or view URI: " + uri); Preconditions.checkNotNull(type, "The entity type can't be null, use Object.class to have the type" + " determined by the schema."); Pair> pair = Registration.lookupDatasetUri(URI.create(uri.getRawSchemeSpecificPart())); DatasetRepository repo = pair.first(); Map uriOptions = pair.second(); return (D) repo.update( uriOptions.get(URIBuilder.NAMESPACE_OPTION), uriOptions.get(URIBuilder.DATASET_NAME_OPTION), descriptor, type); } /** * Update a {@link Dataset} for the given dataset or view URI. *

* You can add columns, remove columns, or change the data type of columns * in your dataset, provided you don't attempt a change that is incompatible * with written data. Avro defines rules for compatible schema evolution. See * Schema * Evolution. *

* This method updates the dataset descriptor, so you can also add * or change properties. *

* The recommended way to update a dataset descriptor is to build it * based on an existing descriptor. Use * {@link DatasetDescriptor.Builder(DatasetDescriptor)} to * build a DatasetDescriptor based on an existing instance. *

* You cannot change a dataset format or partition strategy. *

* URIs must begin with {@code dataset:}. The remainder of * the URI is implementation specific, depending on the dataset scheme. * * @param uri a {@code Dataset} URI * @param the type of {@code Dataset} expected * @return a {@code Dataset} for the given URI * @throws NullPointerException * if {@code uri} or {@code descriptor} is {@code null} * @throws IllegalArgumentException if {@code uri} is not a dataset URI * @throws DatasetNotFoundException * if there is no dataset for the given URI * @throws UnsupportedOperationException * if descriptor updates are not supported by the implementation * @throws ConcurrentSchemaModificationException * if the {@code Dataset} schema is updated concurrently * @throws IncompatibleSchemaException * if the schema is not compatible with previous schemas, or with * existing datasets with shared storage (for example, in the same * HBase table) */ @SuppressWarnings("unchecked") public static > D update( URI uri, DatasetDescriptor descriptor) { return Datasets.update( uri, descriptor, GenericRecord.class); } /** * Update a {@link Dataset} for the given dataset or view URI string. *

* You can add columns, remove columns, or change the data type of columns * in your dataset, provided you don't attempt a change that is incompatible * with written data. Avro defines rules for compatible schema evolution. See * Schema * Evolution. *

* This method updates the dataset descriptor, so you can also add * or change properties. *

* The recommended way to update a dataset descriptor is to build it * based on an existing descriptor. Use * {@link DatasetDescriptor.Builder(DatasetDescriptor)} to * build a DatasetDescriptor based on an existing instance. *

* You cannot change a dataset format or partition strategy. *

* URIs must begin with {@code dataset:}. The remainder of * the URI is implementation specific, depending on the dataset scheme. * * @param uri a {@code Dataset} URI string * @param type a Java class that represents an entity in the dataset * @param the type used for readers and writers created by this * {@code Dataset} * @param the type of {@code Dataset} expected * @return a {@code Dataset} for the given URI * @throws NullPointerException * if {@code uri}, {@code descriptor}, or {@code type} is * {@code null} * @throws IllegalArgumentException if {@code uri} is not a dataset URI * @throws DatasetNotFoundException * if there is no dataset for the given URI * @throws UnsupportedOperationException * if descriptor updates are not supported by the implementation * @throws ConcurrentSchemaModificationException * if the {@code Dataset} schema is updated concurrently * @throws IncompatibleSchemaException * if the schema is not compatible with previous schemas, or with * existing datasets with shared storage (for example, in the same * HBase table) */ public static > D update(String uri, DatasetDescriptor descriptor, Class type) { return Datasets. update(URI.create(uri), descriptor, type); } /** * Update a {@link Dataset} for the given dataset or view URI string. *

* You can add columns, remove columns, or change the data type of columns * in your dataset, provided you don't attempt a change that is incompatible * with written data. Avro defines rules for compatible schema evolution. See * Schema * Evolution. *

* This method updates the dataset descriptor, so you can also add * or change properties. *

* The recommended way to update a dataset descriptor is to build it * based on an existing descriptor. Use * {@link DatasetDescriptor.Builder(DatasetDescriptor)} to * build a DatasetDescriptor based on an existing instance. *

* You cannot change a dataset format or partition strategy. *

* URIs must begin with {@code dataset:}. The remainder of * the URI is implementation specific, depending on the dataset scheme. * * @param uri a {@code Dataset} URI string * @param the type of {@code Dataset} expected * @return a {@code Dataset} for the given URI * @throws NullPointerException * if {@code uri} or {@code descriptor} is {@code null} * @throws IllegalArgumentException if {@code uri} is not a dataset URI * @throws DatasetNotFoundException * if there is no dataset for the given URI * @throws UnsupportedOperationException * if descriptor updates are not supported by the implementation * @throws ConcurrentSchemaModificationException * if the {@code Dataset} schema is updated concurrently * @throws IncompatibleSchemaException * if the schema is not compatible with previous schemas, or with * existing datasets with shared storage (for example, in the same * HBase table) */ public static > D update(String uri, DatasetDescriptor descriptor) { return Datasets.update( uri, descriptor, GenericRecord.class); } /** * Delete a {@link Dataset} identified by the given dataset URI. *

* When you call this method using a dataset URI, both data and metadata are * deleted. After you call this method, the dataset no longer exists, unless * an exception is thrown. *

* When you call this method using a view URI, data in that view is deleted. * The dataset's metadata is not changed. This can throw an * {@code UnsupportedOperationException} if the delete requires additional * work. For example, if some, but not all, of the data in an underlying data * file must be removed, then the implementation is allowed to reject the * deletion rather than copy the remaining records to a new file. * An implementation must document under what conditions it accepts deletes, * and under what conditions it rejects them. *

* URIs must begin with {@code dataset:}. The remainder of * the URI is implementation specific, depending on the dataset scheme. * * @param uri a {@code Dataset} URI * @return {@code true} if any data or metadata is removed, {@code false} * otherwise * @throws NullPointerException if {@code uri} is null * @throws IllegalArgumentException if {@code uri} is not a dataset URI */ public static boolean delete(URI uri) { Preconditions.checkArgument( URIBuilder.DATASET_SCHEME.equals(uri.getScheme()), "Not a dataset URI: " + uri); Pair> pair = Registration.lookupDatasetUri(URI.create(uri.getRawSchemeSpecificPart())); DatasetRepository repo = pair.first(); Map uriOptions = pair.second(); return repo.delete( uriOptions.get(URIBuilder.NAMESPACE_OPTION), uriOptions.get(URIBuilder.DATASET_NAME_OPTION)); } /** * Delete a {@link Dataset} identified by the given dataset URI string. *

* When you call this method using a dataset URI, both data and metadata are * deleted. After you call this method, the dataset no longer exists, unless * an exception is thrown. *

* When you call this method using a view URI, data in that view is deleted. * The dataset's metadata is not changed. This can throw an * {@code UnsupportedOperationException} if the delete requires additional * work. For example, if some, but not all, of the data in an underlying data * file must be removed, then the implementation is allowed to reject the * deletion rather than copy the remaining records to a new file. * An implementation must document under what conditions it accepts deletes, * and under what conditions it rejects them. *

* URIs must begin with {@code dataset:}. The remainder of * the URI is implementation specific, depending on the dataset scheme. * * @param uri a {@code Dataset} URI string * @return {@code true} if any data or metadata is removed, {@code false} * otherwise * @throws NullPointerException if {@code uri} is null * @throws IllegalArgumentException if {@code uri} is not a dataset URI */ public static boolean delete(String uri) { return delete(URI.create(uri)); } /** * Check whether a {@link Dataset} identified by the given URI exists. *

* URIs must begin with {@code dataset:}. The remainder of * the URI is implementation specific, depending on the dataset scheme. * * @param uri a {@code Dataset} URI * @return {@code true} if the dataset exists, {@code false} otherwise * @throws NullPointerException if {@code uri} is null * @throws IllegalArgumentException if {@code uri} is not a dataset URI */ public static boolean exists(URI uri) { Preconditions.checkArgument( URIBuilder.DATASET_SCHEME.equals(uri.getScheme()), "Not a dataset URI: " + uri); Pair> pair = Registration.lookupDatasetUri(URI.create(uri.getRawSchemeSpecificPart())); DatasetRepository repo = pair.first(); Map uriOptions = pair.second(); return repo.exists( uriOptions.get(URIBuilder.NAMESPACE_OPTION), uriOptions.get(URIBuilder.DATASET_NAME_OPTION)); } /** * Check whether a {@link Dataset} identified by the given URI string exists. *

* URIs must begin with {@code dataset:}. The remainder of * the URI is implementation specific, depending on the dataset scheme. * * @param uri a {@code Dataset} URI string * @return {@code true} if the dataset exists, {@code false} otherwise * @throws NullPointerException if {@code uri} is null * @throws IllegalArgumentException if {@code uri} is not a dataset URI */ public static boolean exists(String uri) { return exists(URI.create(uri)); } /** * List the {@link Dataset} URIs in the repository identified by the URI. *

* URI formats are defined by {@code Dataset} implementations. The repository * URIs you pass to this method must begin with {@code repo:}. For example, to * list the {@code Dataset} URIs for the Hive repository, provide the URI * {@code repo:hive}. * * @param uri a {@code DatasetRepository} URI * @return the URIs present in the {@code DatasetRepository} * @throws NullPointerException if {@code uri} is null * @throws IllegalArgumentException if {@code uri} is not a repository URI */ public static Collection list(URI uri) { boolean isRepo = URIBuilder.REPO_SCHEME.equals(uri.getScheme()); Preconditions.checkArgument(isRepo, "Not a repository URI: " + uri); DatasetRepository repo = Registration .open(URI.create(uri.getRawSchemeSpecificPart())); // build a URI for each dataset name URI repoUri = repo.getUri(); List datasets = Lists.newArrayList(); for (String namespace : repo.namespaces()) { for (String dataset : repo.datasets(namespace)) { datasets.add(new URIBuilder(repoUri, namespace, dataset).build()); } } return datasets; } /** * List the {@link Dataset} URIs in the repository identified by the URI * string. *

* URI formats are defined by {@code Dataset} implementations. The repository * URIs you pass to this method must begin with {@code repo:}. For example, to * list the {@code Dataset} URIs for the Hive repository, provide the URI * {@code repo:hive}. * * @param uri a {@code DatasetRepository} URI string * @return the URIs present in the {@code DatasetRepository} * @throws NullPointerException if {@code URI} is null * @throws IllegalArgumentException if {@code uri} is not a repository URI */ public static Collection list(String uri) { return list(URI.create(uri)); } @SuppressWarnings("unchecked") private static > V view(Dataset dataset, Map uriOptions) { if (dataset instanceof AbstractDataset) { DatasetDescriptor descriptor = dataset.getDescriptor(); Schema schema = descriptor.getSchema(); PartitionStrategy strategy = null; if (descriptor.isPartitioned()) { strategy = descriptor.getPartitionStrategy(); } Constraints constraints = Constraints.fromQueryMap( schema, strategy, uriOptions); return (V) ((AbstractDataset) dataset).filter(constraints); } else { return (V) dataset; } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy