All downloads are free. Search and download functionality uses the official Maven repository.

com.google.cloud.dataflow.sdk.transforms.FlatMapElements Maven / Gradle / Ivy

Go to download

The Google Cloud Dataflow Java SDK provides a simple, Java-based interface for processing data of virtually any size using Google Cloud resources. This artifact includes the entire Dataflow Java SDK.

There is a newer version: 2.5.0
Show newest version
/*
 * Copyright (C) 2015 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.google.cloud.dataflow.sdk.transforms;

import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.cloud.dataflow.sdk.values.TypeDescriptor;

import java.lang.reflect.ParameterizedType;

/**
 * {@code PTransform}s for mapping a simple function that returns iterables over the elements of a
 * {@link PCollection} and merging the results.
 */
public class FlatMapElements
extends PTransform, PCollection> {
  /**
   * For a {@code SerializableFunction>} {@code fn},
   * returns a {@link PTransform} that applies {@code fn} to every element of the input
   * {@code PCollection} and outputs all of the elements to the output
   * {@code PCollection}.
   *
   * 

Example of use in Java 8: *

{@code
   * PCollection words = lines.apply(
   *     FlatMapElements.via((String line) -> Arrays.asList(line.split(" ")))
   *         .withOutputType(new TypeDescriptor(){});
   * }
* *

In Java 7, the overload {@link #via(SimpleFunction)} is more concise as the output type * descriptor need not be provided. */ public static MissingOutputTypeDescriptor via(SerializableFunction> fn) { return new MissingOutputTypeDescriptor<>(fn); } /** * For a {@code SimpleFunction>} {@code fn}, * return a {@link PTransform} that applies {@code fn} to every element of the input * {@code PCollection} and outputs all of the elements to the output * {@code PCollection}. * *

This overload is intended primarily for use in Java 7. In Java 8, the overload * {@link #via(SerializableFunction)} supports use of lambda for greater concision. * *

Example of use in Java 7: *

{@code
   * PCollection lines = ...;
   * PCollection words = lines.apply(FlatMapElements.via(
   *     new SimpleFunction>() {
   *       public Integer apply(String line) {
   *         return Arrays.asList(line.split(" "));
   *       }
   *     });
   * }
* *

To use a Java 8 lambda, see {@link #via(SerializableFunction)}. */ public static FlatMapElements via(SimpleFunction> fn) { @SuppressWarnings({"rawtypes", "unchecked"}) // safe by static typing TypeDescriptor> iterableType = (TypeDescriptor) fn.getOutputTypeDescriptor(); @SuppressWarnings("unchecked") // safe by correctness of getIterableElementType TypeDescriptor outputType = (TypeDescriptor) getIterableElementType(iterableType); return new FlatMapElements<>(fn, outputType); } /** * An intermediate builder for a {@link FlatMapElements} transform. To complete the transform, * provide an output type descriptor to {@link MissingOutputTypeDescriptor#withOutputType}. See * {@link #via(SerializableFunction)} for a full example of use. */ public static final class MissingOutputTypeDescriptor { private final SerializableFunction> fn; private MissingOutputTypeDescriptor( SerializableFunction> fn) { this.fn = fn; } public FlatMapElements withOutputType(TypeDescriptor outputType) { return new FlatMapElements<>(fn, outputType); } } private static TypeDescriptor getIterableElementType( TypeDescriptor> iterableTypeDescriptor) { // If a rawtype was used, the type token may be for Object, not a subtype of Iterable. // In this case, we rely on static typing of the function elsewhere to ensure it is // at least some kind of iterable, and grossly overapproximate the element type to be Object. if (!iterableTypeDescriptor.isSubtypeOf(new TypeDescriptor>() {})) { return new TypeDescriptor() {}; } // Otherwise we can do the proper thing and get the actual type parameter. 
ParameterizedType iterableType = (ParameterizedType) iterableTypeDescriptor.getSupertype(Iterable.class).getType(); return TypeDescriptor.of(iterableType.getActualTypeArguments()[0]); } ////////////////////////////////////////////////////////////////////////////////////////////////// private final SerializableFunction> fn; private final transient TypeDescriptor outputType; private FlatMapElements( SerializableFunction> fn, TypeDescriptor outputType) { this.fn = fn; this.outputType = outputType; } @Override public PCollection apply(PCollection input) { return input.apply(ParDo.named("Map").of(new DoFn() { private static final long serialVersionUID = 0L; @Override public void processElement(ProcessContext c) { for (OutputT element : fn.apply(c.element())) { c.output(element); } } })).setTypeDescriptorInternal(outputType); } }