All downloads are free. The search and download functionalities use the official Maven repository.

org.kitesdk.data.crunch.CrunchDatasets Maven / Gradle / Ivy

/**
 * Copyright 2013 Cloudera Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.kitesdk.data.crunch;

import java.net.URI;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.crunch.DoFn;
import org.apache.crunch.Emitter;
import org.apache.crunch.MapFn;
import org.apache.crunch.PCollection;
import org.apache.crunch.PGroupedTable;
import org.apache.crunch.PTable;
import org.apache.crunch.Pair;
import org.apache.crunch.Target;
import org.apache.crunch.io.ReadableSource;
import org.apache.crunch.types.PTableType;
import org.apache.crunch.types.PType;
import org.apache.crunch.types.avro.Avros;
import org.kitesdk.data.Dataset;
import org.kitesdk.data.DatasetDescriptor;
import org.kitesdk.data.PartitionStrategy;
import org.kitesdk.data.View;
import org.kitesdk.data.spi.DataModelUtil;
import org.kitesdk.data.spi.EntityAccessor;
import org.kitesdk.data.spi.FieldPartitioner;
import org.kitesdk.data.spi.PartitionStrategyParser;
import org.kitesdk.data.spi.SchemaUtil;

/**
 * 

* A helper class for exposing {@link Dataset}s and {@link View}s as Crunch * {@link ReadableSource}s or {@link Target}s. *

*/ public class CrunchDatasets { /** * Expose the given {@link View} as a Crunch {@link ReadableSource}. * * @param view the view to read from * @param the type of entity produced by the source * @return a {@link ReadableSource} for the view * * @since 0.14.0 */ public static ReadableSource asSource(View view) { return new DatasetSourceTarget(view); } /** * Expose the {@link View} or {@link Dataset} represented by the URI * as a Crunch {@link ReadableSource}. * * @param uri the URI of the view or dataset to read from * @param type the Java type of the entities in the dataset * @param the type of entity produced by the source * @return a {@link ReadableSource} for the view * * @since 0.15.0 */ public static ReadableSource asSource(URI uri, Class type) { return new DatasetSourceTarget(uri, type); } /** * Expose the {@link View} or {@link Dataset} represented by the URI * as a Crunch {@link ReadableSource}. * * @param uri the URI of the view or dataset to read from * @param type the Java type of the entities in the dataset * @param the type of entity produced by the source * @return a {@link ReadableSource} for the view * * @since 0.15.0 */ public static ReadableSource asSource(String uri, Class type) { return asSource(URI.create(uri), type); } /** * Expose the given {@link View} as a Crunch {@link Target}. * * @param view the view to write to * @param the type of entity stored in the view * @return a {@link Target} for the view * * @since 0.14.0 */ public static Target asTarget(View view) { return new DatasetTarget(view); } /** * Expose the {@link Dataset} or {@link View} represented by the given * URI as a Crunch {@link Target}. * * @param uri the dataset or view URI * @return a {@link Target} for the dataset or view * * @since 0.15.0 */ public static Target asTarget(String uri) { return asTarget(URI.create(uri)); } /** * Expose the {@link Dataset} or {@link View} represented by the given * URI as a Crunch {@link Target}. 
* * @param uri the dataset or view URI * @return a {@link Target} for the dataset or view * * @since 0.15.0 */ public static Target asTarget(URI uri) { return new DatasetTarget(uri); } /** * Partitions {@code collection} to be stored efficiently in {@code View}. *

* This restructures the parallel collection so that all of the entities that * will be stored in a given partition will be processed by the same writer. * * @param collection a collection of entities * @param view a {@link View} of a dataset to partition the collection for * @param the type of entities in the collection and underlying dataset * @return an equivalent collection of entities partitioned for the view * * @since 0.16.0 */ public static PCollection partition(PCollection collection, View view) { return partition(collection, view.getDataset(), -1); } /** * Partitions {@code collection} to be stored efficiently in {@code dataset}. *

* This restructures the parallel collection so that all of the entities that * will be stored in a given partition will be processed by the same writer. * * @param collection a collection of entities * @param dataset a dataset to partition the collection for * @param the type of entities in the collection and underlying dataset * @return an equivalent collection of entities partitioned for the view * * @since 0.16.0 */ public static PCollection partition(PCollection collection, Dataset dataset) { return partition(collection, dataset, -1); } /** * Partitions {@code collection} to be stored efficiently in {@code View}. *

* This restructures the parallel collection so that all of the entities that * will be stored in a given partition will be processed by the same writer. *

* If the dataset is not partitioned, then this will structure all of the * entities to produce a number of files equal to {@code numWriters}. * * @param collection a collection of entities * @param view a {@link View} of a dataset to partition the collection for * @param numWriters the number of writers that should be used * @param the type of entities in the collection and underlying dataset * @return an equivalent collection of entities partitioned for the view * @see #partition(PCollection, View) * * @since 0.16.0 */ public static PCollection partition(PCollection collection, View view, int numWriters) { return partition(collection, view.getDataset().getDescriptor(), view.getType(), numWriters); } /** * Partitions {@code collection} to be stored efficiently in {@code dataset}. *

* This restructures the parallel collection so that all of the entities that * will be stored in a given partition will be processed by the same writer. *

* If the dataset is not partitioned, then this will structure all of the * entities to produce a number of files equal to {@code numWriters}. * * @param collection a collection of entities * @param dataset a dataset to partition the collection for * @param numWriters the number of writers that should be used * @param the type of entities in the collection and underlying dataset * @return an equivalent collection of entities partitioned for the view * @see #partition(PCollection, Dataset) * * @since 0.16.0 */ public static PCollection partition(PCollection collection, Dataset dataset, int numWriters) { return partition(collection, dataset.getDescriptor(), dataset.getType(), numWriters); } private static PCollection partition(PCollection collection, DatasetDescriptor descriptor, Class type, int numReducers) { if (descriptor.isPartitioned()) { GetStorageKey getKey = new GetStorageKey( descriptor.getPartitionStrategy(), descriptor.getSchema(), type); PTable table = collection .by(getKey, Avros.generics(getKey.schema())); PGroupedTable grouped = numReducers > 0 ? table.groupByKey(numReducers) : table.groupByKey(); return grouped.ungroup().values(); } else { return partition(collection, numReducers); } } private static PCollection partition(PCollection collection, int numReducers) { PType type = collection.getPType(); PTableType tableType = Avros.tableOf(type, Avros.nulls()); PTable table = collection.parallelDo(new AsKeyTable(), tableType); PGroupedTable grouped = numReducers > 0 ? 
table.groupByKey(numReducers) : table.groupByKey(); return grouped.ungroup().keys(); } @edu.umd.cs.findbugs.annotations.SuppressWarnings( value="SE_NO_SERIALVERSIONID", justification="Purposely not supported across versions") private static class AsKeyTable extends DoFn> { @Override public void process(E entity, Emitter> emitter) { emitter.emit(Pair.of(entity, (Void) null)); } } @edu.umd.cs.findbugs.annotations.SuppressWarnings( value="SE_NO_SERIALVERSIONID", justification="Purposely not supported across versions") private static class GetStorageKey extends MapFn { private final String strategyString; private final String schemaString; private final Class type; private transient AvroStorageKey key = null; private transient EntityAccessor accessor = null; private GetStorageKey(PartitionStrategy strategy, Schema schema, Class type) { this.strategyString = strategy.toString(false /* no white space */); this.schemaString = schema.toString(false /* no white space */); this.type = type; } public Schema schema() { initialize(); // make sure the key is not null return key.getSchema(); } @Override public void initialize() { if (key == null) { PartitionStrategy strategy = PartitionStrategyParser.parse(strategyString); Schema schema = new Schema.Parser().parse(schemaString); this.key = new AvroStorageKey(strategy, schema); this.accessor = DataModelUtil.accessor(type, schema); } } @Override public AvroStorageKey map(E entity) { return key.reuseFor(entity, accessor); } } @edu.umd.cs.findbugs.annotations.SuppressWarnings( value="EQ_DOESNT_OVERRIDE_EQUALS", justification="StorageKey equals is correct, compares the values") private static class AvroStorageKey extends GenericData.Record { private final PartitionStrategy strategy; private AvroStorageKey(PartitionStrategy strategy, Schema schema) { super(SchemaUtil.keySchema(schema, strategy)); this.strategy = strategy; } @SuppressWarnings({"unchecked", "deprecation"}) public AvroStorageKey reuseFor(E entity, EntityAccessor accessor) 
{ List partitioners = strategy.getFieldPartitioners(); for (int i = 0; i < partitioners.size(); i++) { FieldPartitioner fp = partitioners.get(i); put(i, fp.apply(accessor.get(entity, fp.getSourceName()))); } return this; } } }