// org.kitesdk.data.Datasets -- from the kite-data-core artifact (Maven / Gradle / Ivy)
/*
* Copyright 2013 Cloudera Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.kitesdk.data;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import java.net.URI;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.kitesdk.data.spi.DatasetRepository;
import org.kitesdk.data.spi.AbstractDataset;
import org.kitesdk.data.spi.Constraints;
import org.kitesdk.data.spi.Pair;
import org.kitesdk.data.spi.Registration;
/**
* Convenience methods for working with {@link Dataset} instances.
*
* @since 0.8.0
*/
public class Datasets {
/**
* Load a {@link Dataset} or {@link View} for the given {@link URI}.
*
* If the URI is a dataset URI, the unfiltered Dataset will be returned.
*
* URI formats are defined by {@code Dataset} implementations, but must begin
* with "dataset:" or "view:".
*
* @param uri a {@code Dataset} or {@code View} URI.
* @param type the Java type of the entities in the dataset
* @param The type of entities stored in the {@code Dataset}.
* @param The type of {@code View} expected.
* @return a {@code View} for the given URI.
*/
@SuppressWarnings("unchecked")
public static > V load(URI uri, Class type) {
boolean isView = URIBuilder.VIEW_SCHEME.equals(uri.getScheme());
Preconditions.checkArgument(isView ||
URIBuilder.DATASET_SCHEME.equals(uri.getScheme()),
"Not a dataset or view URI: " + uri);
Preconditions.checkNotNull(type,
"The entity type can't be null, use Object.class to have the type"
+ " determined by the schema.");
Pair> pair =
Registration.lookupDatasetUri(URI.create(uri.getRawSchemeSpecificPart()));
DatasetRepository repo = pair.first();
Map uriOptions = pair.second();
Dataset dataset = repo.load(
uriOptions.get(URIBuilder.NAMESPACE_OPTION),
uriOptions.get(URIBuilder.DATASET_NAME_OPTION), type);
if (isView) {
return Datasets. view(dataset, uriOptions);
} else {
// if the URI isn't a view URI, only load the dataset
return (V) dataset;
}
}
/**
* Load a {@link Dataset} or {@link View} for the given {@link URI}.
*
* If the URI is a dataset URI, the unfiltered Dataset will be returned.
*
* URI formats are defined by {@code Dataset} implementations, but must begin
* with "dataset:" or "view:".
*
* @param uri a {@code Dataset} or {@code View} URI.
* @param The type of {@code View} expected.
* @return a {@code View} for the given URI.
*/
@SuppressWarnings("unchecked")
public static > V load(URI uri) {
return Datasets.load(uri, GenericRecord.class);
}
/**
* Load a {@link Dataset} or {@link View} for the given URI string.
*
* If the URI is a dataset URI, the unfiltered Dataset will be returned.
*
* URI formats are defined by {@code Dataset} implementations, but must begin
* with "dataset:" or "view:".
*
* @param uriString a {@code Dataset} or {@code View} URI.
* @param type the Java type of the entities in the dataset
* @param The type of entities stored in the {@code Dataset}.
* @param The type of {@code View} expected.
* @return a {@code View} for the given URI.
*/
public static > V load(String uriString, Class type) {
return Datasets. load(URI.create(uriString), type);
}
/**
* Load a {@link Dataset} or {@link View} for the given URI string.
*
* If the URI is a dataset URI, the unfiltered Dataset will be returned.
*
* URI formats are defined by {@code Dataset} implementations, but must begin
* with "dataset:" or "view:".
*
* @param uriString a {@code Dataset} or {@code View} URI.
* @param The type of {@code View} expected.
* @return a {@code View} for the given URI.
*/
public static > V load(String uriString) {
return Datasets.load(
uriString, GenericRecord.class);
}
/**
* Create a {@link Dataset} for the given dataset or view URI.
*
* URI formats are defined by {@code Dataset} implementations, but must begin
* with "dataset:" or "view:".
*
* @param uri a {@code Dataset} or {@code View} URI.
* @param type the Java type of the entities in the dataset
* @param The type of entities stored in the {@code Dataset}.
* @param The type of {@code Dataset} or {@code View} expected.
* @return a newly created {@code Dataset} responsible for the given URI.
*/
@SuppressWarnings("unchecked")
public static > V create(URI uri, DatasetDescriptor descriptor, Class type) {
boolean isView = URIBuilder.VIEW_SCHEME.equals(uri.getScheme());
Preconditions.checkArgument(isView ||
URIBuilder.DATASET_SCHEME.equals(uri.getScheme()),
"Not a dataset or view URI: " + uri);
Preconditions.checkNotNull(type,
"The entity type can't be null, use Object.class to have the type"
+ " determined by the schema.");
Pair> pair =
Registration.lookupDatasetUri(URI.create(uri.getRawSchemeSpecificPart()));
DatasetRepository repo = pair.first();
Map uriOptions = pair.second();
if (descriptor.getLocation() == null && uriOptions.containsKey("location")) {
descriptor = new DatasetDescriptor.Builder(descriptor)
.location(uriOptions.get("location"))
.build();
}
Dataset dataset = repo.create(
uriOptions.get(URIBuilder.NAMESPACE_OPTION),
uriOptions.get(URIBuilder.DATASET_NAME_OPTION), descriptor, type);
if (isView) {
return Datasets. view(dataset, uriOptions);
} else {
return (V) dataset;
}
}
/**
* Create a {@link Dataset} for the given dataset or view URI.
*
* URI formats are defined by {@code Dataset} implementations, but must begin
* with "dataset:" or "view:".
*
* @param uri a {@code Dataset} or {@code View} URI.
* @param The type of {@code Dataset} or {@code View} expected.
* @return a newly created {@code Dataset} responsible for the given URI.
*/
@SuppressWarnings("unchecked")
public static > V create(URI uri, DatasetDescriptor descriptor) {
return Datasets.create(
uri, descriptor, GenericRecord.class);
}
/**
* Create a {@link Dataset} for the given dataset or view URI string.
*
* URI formats are defined by {@code Dataset} implementations, but must begin
* with "dataset:" or "view:".
*
* @param uri a {@code Dataset} or {@code View} URI string.
* @param type the Java type of the entities in the dataset
* @param The type of entities stored in the {@code Dataset}.
* @param The type of {@code Dataset} or {@code View} expected.
* @return a newly created {@code Dataset} responsible for the given URI.
*/
public static > V create(String uri, DatasetDescriptor descriptor, Class type) {
return Datasets. create(URI.create(uri), descriptor, type);
}
/**
* Create a {@link Dataset} for the given dataset or view URI string.
*
* URI formats are defined by {@code Dataset} implementations, but must begin
* with "dataset:" or "view:".
*
* @param uri a {@code Dataset} or {@code View} URI string.
* @param The type of {@code Dataset} or {@code View} expected.
* @return a newly created {@code Dataset} responsible for the given URI.
*/
@SuppressWarnings("unchecked")
public static > V create(String uri, DatasetDescriptor descriptor) {
return Datasets.create(
uri, descriptor, GenericRecord.class);
}
/**
* Update a {@link Dataset} for the given dataset or view URI.
*
* URI formats are defined by {@code Dataset} implementations, but must begin
* with "dataset:" or "view:".
*
* @param uri a {@code Dataset} or {@code View} URI.
* @param type the Java type of the entities in the dataset
* @param The type of entities stored in the {@code Dataset}.
* @param The type of {@code Dataset} expected.
* @return a newly created {@code Dataset} responsible for the given URI.
*/
@SuppressWarnings("unchecked")
public static > D update(
URI uri, DatasetDescriptor descriptor, Class type) {
Preconditions.checkArgument(
URIBuilder.DATASET_SCHEME.equals(uri.getScheme()),
"Not a dataset or view URI: " + uri);
Preconditions.checkNotNull(type,
"The entity type can't be null, use Object.class to have the type"
+ " determined by the schema.");
Pair> pair =
Registration.lookupDatasetUri(URI.create(uri.getRawSchemeSpecificPart()));
DatasetRepository repo = pair.first();
Map uriOptions = pair.second();
return (D) repo.update(
uriOptions.get(URIBuilder.NAMESPACE_OPTION),
uriOptions.get(URIBuilder.DATASET_NAME_OPTION), descriptor, type);
}
/**
* Update a {@link Dataset} for the given dataset or view URI.
*
* URI formats are defined by {@code Dataset} implementations, but must begin
* with "dataset:" or "view:".
*
* @param uri a {@code Dataset} or {@code View} URI.
* @param The type of {@code Dataset} expected.
* @return a newly created {@code Dataset} responsible for the given URI.
*/
@SuppressWarnings("unchecked")
public static > D update(
URI uri, DatasetDescriptor descriptor) {
return Datasets.update(
uri, descriptor, GenericRecord.class);
}
/**
* Update a {@link Dataset} for the given dataset or view URI string.
*
* URI formats are defined by {@code Dataset} implementations, but must begin
* with "dataset:" or "view:".
*
* @param uri a {@code Dataset} or {@code View} URI string.
* @param type the Java type of the entities in the dataset
* @param The type of entities stored in the {@code Dataset}.
* @param The type of {@code Dataset} expected.
* @return a newly created {@code Dataset} responsible for the given URI.
*/
public static > D update(String uri, DatasetDescriptor descriptor, Class type) {
return Datasets. update(URI.create(uri), descriptor, type);
}
/**
* Update a {@link Dataset} for the given dataset or view URI string.
*
* URI formats are defined by {@code Dataset} implementations, but must begin
* with "dataset:" or "view:".
*
* @param uri a {@code Dataset} or {@code View} URI string.
* @param The type of {@code Dataset} expected.
* @return a newly created {@code Dataset} responsible for the given URI.
*/
public static > D update(String uri, DatasetDescriptor descriptor) {
return Datasets.update(
uri, descriptor, GenericRecord.class);
}
/**
* Delete a {@link Dataset} identified by the given dataset URI.
*
* URI formats are defined by {@code Dataset} implementations, but must begin
* with "dataset:".
*
* @param uri a {@code Dataset} URI.
* @return {@code true} if any data or metadata was removed, or {@code false}
*/
public static boolean delete(URI uri) {
Preconditions.checkArgument(
URIBuilder.DATASET_SCHEME.equals(uri.getScheme()),
"Not a dataset URI: " + uri);
Pair> pair =
Registration.lookupDatasetUri(URI.create(uri.getRawSchemeSpecificPart()));
DatasetRepository repo = pair.first();
Map uriOptions = pair.second();
return repo.delete(
uriOptions.get(URIBuilder.NAMESPACE_OPTION),
uriOptions.get(URIBuilder.DATASET_NAME_OPTION));
}
/**
* Delete a {@link Dataset} identified by the given dataset URI string.
*
* URI formats are defined by {@code Dataset} implementations, but must begin
* with "dataset:".
*
* @param uri a {@code Dataset} URI string.
* @return {@code true} if any data or metadata was removed, or {@code false}
*/
public static boolean delete(String uri) {
return delete(URI.create(uri));
}
/**
* Check if a {@link Dataset} identified by the given URI exists.
*
* URI formats are defined by {@code Dataset} implementations, but must begin
* with "dataset:".
*
* @param uri a {@code Dataset} URI.
* @return {@code true} if the dataset exists, {@code false} otherwise
*/
public static boolean exists(URI uri) {
Preconditions.checkArgument(
URIBuilder.DATASET_SCHEME.equals(uri.getScheme()),
"Not a dataset URI: " + uri);
Pair> pair =
Registration.lookupDatasetUri(URI.create(uri.getRawSchemeSpecificPart()));
DatasetRepository repo = pair.first();
Map uriOptions = pair.second();
return repo.exists(
uriOptions.get(URIBuilder.NAMESPACE_OPTION),
uriOptions.get(URIBuilder.DATASET_NAME_OPTION));
}
/**
* Check if a {@link Dataset} identified by the given URI string exists.
*
* URI formats are defined by {@code Dataset} implementations, but must begin
* with "dataset:".
*
* @param uri a {@code Dataset} URI string.
* @return {@code true} if the dataset exists, {@code false} otherwise
*/
public static boolean exists(String uri) {
return exists(URI.create(uri));
}
/**
* List the {@link Dataset} URIs in the repository identified by the URI
*
* URI formats are defined by {@code Dataset} implementations. The repository
* URIs passed to this method must begin with "repo:".
*
* @param uri a {@code DatasetRepository} URI
* @return the URIs present in the {@code DatasetRepository}
*/
public static Collection list(URI uri) {
boolean isRepo = URIBuilder.REPO_SCHEME.equals(uri.getScheme());
Preconditions.checkArgument(isRepo, "Not a repository URI: " + uri);
DatasetRepository repo = Registration
.open(URI.create(uri.getRawSchemeSpecificPart()));
// build a URI for each dataset name
URI repoUri = repo.getUri();
List datasets = Lists.newArrayList();
for (String namespace : repo.namespaces()) {
for (String dataset : repo.datasets(namespace)) {
datasets.add(new URIBuilder(repoUri, namespace, dataset).build());
}
}
return datasets;
}
/**
* List the {@link Dataset} URIs in the repository identified by the URI string
*
* URI formats are defined by {@code Dataset} implementations. The repository
* URIs passed to this method must begin with "repo:".
*
* @param uri a {@code DatasetRepository} URI string
* @return the URIs present in the {@code DatasetRepository}
*/
public static Collection list(String uri) {
return list(URI.create(uri));
}
@SuppressWarnings("unchecked")
private static > V view(Dataset dataset,
Map uriOptions) {
if (dataset instanceof AbstractDataset) {
DatasetDescriptor descriptor = dataset.getDescriptor();
Schema schema = descriptor.getSchema();
PartitionStrategy strategy = null;
if (descriptor.isPartitioned()) {
strategy = descriptor.getPartitionStrategy();
}
Constraints constraints = Constraints.fromQueryMap(
schema, strategy, uriOptions);
return (V) ((AbstractDataset) dataset).filter(constraints);
} else {
return (V) dataset;
}
}
}