
// com.google.cloud.dataflow.sdk.transforms.RemoveDuplicates (Maven / Gradle / Ivy)
/*
* Copyright (C) 2015 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package com.google.cloud.dataflow.sdk.transforms;
import com.google.cloud.dataflow.sdk.values.KV;
import com.google.cloud.dataflow.sdk.values.PCollection;
/**
* {@code RemoveDuplicates} takes a {@code PCollection} and
* returns a {@code PCollection} that has all the elements of the
* input but with duplicate elements removed such that each element is
* unique within each window.
*
* Two values of type {@code T} are compared for equality not by
* regular Java {@link Object#equals}, but instead by first encoding
* each of the elements using the {@code PCollection}'s {@code Coder}, and then
* comparing the encoded bytes. This admits efficient parallel
* evaluation.
*
*
Optionally, a function may be provided that maps each element to a representative
* value. In this case, two elements will be considered duplicates if they have equal
* representative values, with equality being determined as above.
*
*
By default, the {@code Coder} of the output {@code PCollection}
* is the same as the {@code Coder} of the input {@code PCollection}.
*
*
Each output element is in the same window as its corresponding input
* element, and has the timestamp of the end of that window. The output
* {@code PCollection} has the same
* {@link com.google.cloud.dataflow.sdk.transforms.windowing.WindowFn}
* as the input.
*
*
Does not preserve any order the input PCollection might have had.
*
*
Example of use:
*
{@code
* PCollection words = ...;
* PCollection uniqueWords =
* words.apply(RemoveDuplicates.create());
* }
*
* @param the type of the elements of the input and output
* {@code PCollection}s
*/
@SuppressWarnings("serial")
public class RemoveDuplicates extends PTransform,
PCollection> {
/**
* Returns a {@code RemoveDuplicates} {@code PTransform}.
*
* @param the type of the elements of the input and output
* {@code PCollection}s
*/
public static RemoveDuplicates create() {
return new RemoveDuplicates();
}
/**
* Returns a {@code RemoveDuplicates} {@code PTransform}.
*
* @param the type of the elements of the input and output
* {@code PCollection}s
* @param the type of the representative value used to dedup
*/
public static WithRepresentativeValues withRepresentativeValueFn(
SerializableFunction fn) {
return new WithRepresentativeValues(fn);
}
@Override
public PCollection apply(PCollection in) {
return in
.apply(ParDo.named("CreateIndex")
.of(new DoFn>() {
@Override
public void processElement(ProcessContext c) {
c.output(KV.of(c.element(), (Void) null));
}
}))
.apply(Combine.perKey(
new SerializableFunction, Void>() {
@Override
public Void apply(Iterable iter) {
return null; // ignore input
}
}))
.apply(Keys.create());
}
private static class WithRepresentativeValues
extends PTransform, PCollection> {
private SerializableFunction fn;
private WithRepresentativeValues(SerializableFunction fn) {
this.fn = fn;
}
@Override
public PCollection apply(PCollection in) {
return in
.apply(WithKeys.of(fn))
.apply(Combine.perKey(
new Combine.BinaryCombineFn() {
@Override
public T apply(T left, T right) {
return left;
}
}))
.apply(Values.create());
}
}
}