All downloads are free. Search and download functionality uses the official Maven repository.

com.google.cloud.dataflow.sdk.transforms.RemoveDuplicates Maven / Gradle / Ivy

/*
 * Copyright (C) 2015 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.google.cloud.dataflow.sdk.transforms;

import com.google.cloud.dataflow.sdk.values.KV;
import com.google.cloud.dataflow.sdk.values.PCollection;

/**
 * {@code RemoveDuplicates} takes a {@code PCollection} and
 * returns a {@code PCollection} that has all the elements of the
 * input but with duplicate elements removed such that each element is
 * unique within each window.
 *
 * 

Two values of type {@code T} are compared for equality not by * regular Java {@link Object#equals}, but instead by first encoding * each of the elements using the {@code PCollection}'s {@code Coder}, and then * comparing the encoded bytes. This admits efficient parallel * evaluation. * *

Optionally, a function may be provided that maps each element to a representative * value. In this case, two elements will be considered duplicates if they have equal * representative values, with equality being determined as above. * *

By default, the {@code Coder} of the output {@code PCollection} * is the same as the {@code Coder} of the input {@code PCollection}. * *

Each output element is in the same window as its corresponding input * element, and has the timestamp of the end of that window. The output * {@code PCollection} has the same * {@link com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn} * as the input. * *

Does not preserve any order the input PCollection might have had. * *

Example of use: *

 {@code
 * PCollection words = ...;
 * PCollection uniqueWords =
 *     words.apply(RemoveDuplicates.create());
 * } 
* * @param the type of the elements of the input and output * {@code PCollection}s */ @SuppressWarnings("serial") public class RemoveDuplicates extends PTransform, PCollection> { /** * Returns a {@code RemoveDuplicates} {@code PTransform}. * * @param the type of the elements of the input and output * {@code PCollection}s */ public static RemoveDuplicates create() { return new RemoveDuplicates(); } /** * Returns a {@code RemoveDuplicates} {@code PTransform}. * * @param the type of the elements of the input and output * {@code PCollection}s * @param the type of the representative value used to dedup */ public static WithRepresentativeValues withRepresentativeValueFn( SerializableFunction fn) { return new WithRepresentativeValues(fn); } @Override public PCollection apply(PCollection in) { return in .apply(ParDo.named("CreateIndex") .of(new DoFn>() { @Override public void processElement(ProcessContext c) { c.output(KV.of(c.element(), (Void) null)); } })) .apply(Combine.perKey( new SerializableFunction, Void>() { @Override public Void apply(Iterable iter) { return null; // ignore input } })) .apply(Keys.create()); } private static class WithRepresentativeValues extends PTransform, PCollection> { private SerializableFunction fn; private WithRepresentativeValues(SerializableFunction fn) { this.fn = fn; } @Override public PCollection apply(PCollection in) { return in .apply(WithKeys.of(fn)) .apply(Combine.perKey( new Combine.BinaryCombineFn() { @Override public T apply(T left, T right) { return left; } })) .apply(Values.create()); } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy