All downloads are free. Search and download functionality uses the official Maven repository.

com.google.cloud.dataflow.sdk.runners.inprocess.UnboundedReadDeduplicator Maven / Gradle / Ivy

Go to download

The Google Cloud Dataflow Java SDK provides a simple, Java-based interface for processing data of virtually any size using Google Cloud resources. This artifact includes the entire Dataflow Java SDK.

There is a newer version: 2.5.0
Show newest version
/*
 * Copyright (C) 2016 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package com.google.cloud.dataflow.sdk.runners.inprocess;

import com.google.cloud.dataflow.sdk.coders.ByteArrayCoder;
import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;

import org.joda.time.Duration;

import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;

/**
 * Provides methods to determine if a record is a duplicate within the evaluation of a
 * {@link Unbounded} {@link PTransform}.
 */
interface UnboundedReadDeduplicator {
  /**
   * Returns true if the record with the provided ID should be output, and false if it should not
   * be because it is a duplicate.
   */
  boolean shouldOutput(byte[] recordId);

  /**
   * An {@link UnboundedReadDeduplicator} that always returns true. For use with sources do not
   * require deduplication.
   */
  class NeverDeduplicator implements UnboundedReadDeduplicator {
    /**
     * Create a new {@link NeverDeduplicator}.
     */
    public static UnboundedReadDeduplicator create() {
      return new NeverDeduplicator();
    }

    @Override
    public boolean shouldOutput(byte[] recordId) {
      return true;
    }
  }


  /**
   * An {@link UnboundedReadDeduplicator} that returns true if the record ID has not been seen
   * within 10 minutes.
   */
  class CachedIdDeduplicator implements UnboundedReadDeduplicator {
    private static final ByteArrayCoder RECORD_ID_CODER = ByteArrayCoder.of();
    private static final long MAX_RETENTION_SINCE_ACCESS =
        Duration.standardMinutes(10L).getMillis();

    private final LoadingCache, AtomicBoolean> ids;

    /**
     * Create a new {@link CachedIdDeduplicator}.
     */
    public static UnboundedReadDeduplicator create() {
      return new CachedIdDeduplicator();
    }

    private CachedIdDeduplicator() {
      ids = CacheBuilder.newBuilder()
          .expireAfterAccess(MAX_RETENTION_SINCE_ACCESS, TimeUnit.MILLISECONDS)
          .maximumSize(100_000L)
          .build(new TrueBooleanLoader());
    }

    @Override
    public boolean shouldOutput(byte[] recordId) {
      return ids.getUnchecked(StructuralKey.of(recordId, RECORD_ID_CODER)).getAndSet(false);
    }

    private static class TrueBooleanLoader
        extends CacheLoader, AtomicBoolean> {
      @Override
      public AtomicBoolean load(StructuralKey key) throws Exception {
        return new AtomicBoolean(true);
      }
    }
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy