
com.hazelcast.jet.core.Edge

/*
 * Copyright (c) 2008-2018, Hazelcast, Inc. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.hazelcast.jet.core;

import com.hazelcast.jet.config.EdgeConfig;
import com.hazelcast.jet.function.DistributedFunction;
import com.hazelcast.jet.impl.MasterContext;
import com.hazelcast.jet.impl.SerializationConstants;
import com.hazelcast.jet.impl.execution.init.CustomClassLoadedObject;
import com.hazelcast.nio.ObjectDataInput;
import com.hazelcast.nio.ObjectDataOutput;
import com.hazelcast.nio.serialization.IdentifiedDataSerializable;
import com.hazelcast.util.UuidUtil;

import javax.annotation.Nonnull;
import java.io.IOException;
import java.io.Serializable;
import java.util.Map;

import static com.hazelcast.jet.core.Partitioner.defaultPartitioner;
import static com.hazelcast.jet.function.DistributedFunctions.wholeItem;
import static com.hazelcast.jet.impl.util.Util.checkSerializable;

/**
 * Represents an edge between two {@link Vertex vertices} in a {@link DAG}.
 * Conceptually, data travels over the edge from the source vertex to the
 * destination vertex. Practically, since the vertex is distributed across
 * the cluster and across threads in each cluster member, the edge is
 * implemented by a number of concurrent queues and network sender/receiver
 * pairs.
 * <p>
 * It is often desirable to arrange that all items belonging to the same
 * collation key are received by the same processing unit (instance of
 * {@link Processor}). This is achieved by configuring an appropriate
 * {@link Partitioner} on the edge. The partitioner will determine the
 * partition ID of each item and all items with the same partition ID will
 * be routed to the same {@code Processor} instance. Depending on the value
 * of edge's <em>distributed</em> property, the processor will be unique
 * cluster-wide, or only within each member.
 * <p>
 * A newly instantiated Edge is non-distributed with a {@link
 * RoutingPolicy#UNICAST UNICAST} routing policy.
 */
public class Edge implements IdentifiedDataSerializable {

    private Vertex source; // transient field, restored during DAG deserialization
    private String sourceName;
    private int sourceOrdinal;

    private Vertex destination; // transient field, restored during DAG deserialization
    private String destName;
    private int destOrdinal;

    private int priority;
    private boolean isDistributed;
    private Partitioner<?> partitioner;
    private RoutingPolicy routingPolicy = RoutingPolicy.UNICAST;

    private EdgeConfig config;

    protected Edge() {
    }

    protected Edge(@Nonnull Vertex source, int sourceOrdinal, Vertex destination, int destOrdinal) {
        this.source = source;
        this.sourceName = source.getName();
        this.sourceOrdinal = sourceOrdinal;

        this.destination = destination;
        this.destName = destination != null ? destination.getName() : null;
        this.destOrdinal = destOrdinal;
    }

    /**
     * Returns an edge between two vertices. The ordinal of the edge
     * is 0 at both ends. Equivalent to {@code from(source).to(destination)}.
     *
     * @param source      the source vertex
     * @param destination the destination vertex
     */
    @Nonnull
    public static Edge between(@Nonnull Vertex source, @Nonnull Vertex destination) {
        return new Edge(source, 0, destination, 0);
    }

    /**
     * Returns an edge with the given source vertex and no destination vertex.
     * The ordinal of the edge is 0. Typically followed by one of the
     * {@code to()} method calls.
     */
    @Nonnull
    public static Edge from(@Nonnull Vertex source) {
        return from(source, 0);
    }

    /**
     * Returns an edge with the given source vertex at the given ordinal
     * and no destination vertex. Typically followed by a call to one of
     * the {@code to()} methods.
     */
    @Nonnull
    public static Edge from(@Nonnull Vertex source, int ordinal) {
        return new Edge(source, ordinal, null, 0);
    }

    /**
     * Sets the destination vertex of this edge, with ordinal 0.
     */
    @Nonnull
    public Edge to(@Nonnull Vertex destination) {
        this.destination = destination;
        this.destName = destination.getName();
        return this;
    }

    /**
     * Sets the destination vertex and ordinal of this edge.
     */
    @Nonnull
    public Edge to(@Nonnull Vertex destination, int ordinal) {
        this.destination = destination;
        this.destName = destination.getName();
        this.destOrdinal = ordinal;
        return this;
    }

    /**
     * Returns this edge's source vertex.
     */
    @Nonnull
    public Vertex getSource() {
        return source;
    }

    /**
     * Returns this edge's destination vertex.
     */
    public Vertex getDestination() {
        return destination;
    }

    /**
     * Returns the name of the source vertex.
     */
    @Nonnull
    public String getSourceName() {
        return sourceName;
    }

    /**
     * Returns the ordinal of the edge at the source vertex.
     */
    public int getSourceOrdinal() {
        return sourceOrdinal;
    }

    /**
     * Returns the name of the destination vertex.
     */
    public String getDestName() {
        return destName;
    }

    /**
     * Returns the ordinal of the edge at the destination vertex.
     */
    public int getDestOrdinal() {
        return destOrdinal;
    }

    /**
     * Sets the priority of the edge. A lower number means higher priority
     * and the default is 0.
     * <p>
     * Example: there are two incoming edges on a vertex, with priorities 1 and 2.
     * The data from the edge with priority 1 will be processed in full before
     * accepting any data from the edge with priority 2.
     * <p>
     * Note: having different priority edges will cause postponing of
     * the first snapshot until after upstream vertices of higher priority
     * edges are completed. Reason: after receiving a {@link
     * com.hazelcast.jet.impl.execution.SnapshotBarrier barrier} we stop
     * processing items on that edge until the barrier is received from all
     * other edges. However, we also don't process lower priority edges until
     * higher priority edges are done, which prevents receiving the barrier on
     * them, which in the end stalls the job indefinitely. Technically this
     * applies only to {@link
     * com.hazelcast.jet.config.ProcessingGuarantee#EXACTLY_ONCE EXACTLY_ONCE}
     * snapshot mode, but the snapshot is also postponed for {@link
     * com.hazelcast.jet.config.ProcessingGuarantee#AT_LEAST_ONCE
     * AT_LEAST_ONCE} jobs, because the snapshot won't complete until after all
     * higher priority edges are completed and will increase the number of
     * duplicately processed items.
     */
    @Nonnull
    public Edge priority(int priority) {
        if (priority == MasterContext.SNAPSHOT_RESTORE_EDGE_PRIORITY) {
            throw new IllegalArgumentException("priority must not be Integer.MIN_VALUE ("
                    + MasterContext.SNAPSHOT_RESTORE_EDGE_PRIORITY + ')');
        }
        this.priority = priority;
        return this;
    }

    /**
     * Returns the value of edge's priority, as explained on
     * {@link #priority(int)}.
     */
    public int getPriority() {
        return priority;
    }

    /**
     * Activates the {@link RoutingPolicy#PARTITIONED PARTITIONED} routing
     * policy and applies the {@link Partitioner#defaultPartitioner() default}
     * Hazelcast partitioning strategy. The strategy is applied to the result of
     * the {@code extractKeyFn} function.
     */
    @Nonnull
    public <T> Edge partitioned(@Nonnull DistributedFunction<T, ?> extractKeyFn) {
        return partitioned(extractKeyFn, defaultPartitioner());
    }

    /**
     * Activates the {@link RoutingPolicy#PARTITIONED PARTITIONED} routing
     * policy and applies the provided partitioning strategy. The strategy
     * is applied to the result of the {@code extractKeyFn} function.
     */
    @Nonnull
    public <T, K> Edge partitioned(
            @Nonnull DistributedFunction<T, K> extractKeyFn,
            @Nonnull Partitioner<? super K> partitioner
    ) {
        checkSerializable(extractKeyFn, "extractKeyFn");
        checkSerializable(partitioner, "partitioner");
        this.routingPolicy = RoutingPolicy.PARTITIONED;
        this.partitioner = new KeyPartitioner<>(extractKeyFn, partitioner);
        return this;
    }

    /**
     * Activates a special-cased {@link RoutingPolicy#PARTITIONED PARTITIONED}
     * routing policy where all items will be assigned the same, randomly
     * chosen partition ID. Therefore all items will be directed to the same
     * processor.
     */
    @Nonnull
    public Edge allToOne() {
        return partitioned(wholeItem(), new Single());
    }

    /**
     * Activates the {@link RoutingPolicy#BROADCAST BROADCAST} routing policy.
     */
    @Nonnull
    public Edge broadcast() {
        routingPolicy = RoutingPolicy.BROADCAST;
        return this;
    }

    /**
     * Activates the {@link RoutingPolicy#ISOLATED ISOLATED} routing policy
     * which establishes isolated paths from upstream to downstream processors.
     * Each downstream processor is assigned exactly one upstream processor and
     * each upstream processor is assigned a disjoint subset of downstream
     * processors. This allows the selective application of backpressure to
     * just one source processor that feeds a given downstream processor.
     * <p>
     * These restrictions imply that the downstream's local parallelism
     * cannot be less than upstream's. Since all traffic will be local, this
     * policy is not allowed on a distributed edge.
     */
    @Nonnull
    public Edge isolated() {
        routingPolicy = RoutingPolicy.ISOLATED;
        return this;
    }

    /**
     * Returns the instance encapsulating the partitioning strategy in effect
     * on this edge.
     */
    public Partitioner<?> getPartitioner() {
        return partitioner;
    }

    /**
     * Returns the {@link RoutingPolicy} in effect on the edge.
     */
    @Nonnull
    public RoutingPolicy getRoutingPolicy() {
        return routingPolicy;
    }

    /**
     * Declares that the edge is distributed. A non-distributed edge only
     * transfers data within the same member. If the data source running on
     * local member is distributed (produces only a slice of all the data on
     * any given member), the local processors will not observe all the data.
     * The same holds true when the data originates from an upstream
     * distributed edge.
     * <p>
     * A distributed edge allows all the data to be observed by all
     * the processors (using the {@link RoutingPolicy#BROADCAST BROADCAST}
     * routing policy) and, more attractively, all the data with a given
     * partition ID to be observed by the same unique processor, regardless of
     * whether it is running on the local or a remote member (using the {@link
     * RoutingPolicy#PARTITIONED PARTITIONED} routing policy).
     */
    public Edge distributed() {
        isDistributed = true;
        return this;
    }

    /**
     * Says whether this edge is distributed. The effects of this
     * property are discussed in {@link #distributed()}.
     */
    public boolean isDistributed() {
        return isDistributed;
    }

    /**
     * Returns the {@code EdgeConfig} instance associated with this edge.
     */
    public EdgeConfig getConfig() {
        return config;
    }

    /**
     * Assigns an {@code EdgeConfig} to this edge.
     */
    public Edge setConfig(EdgeConfig config) {
        this.config = config;
        return this;
    }

    @Nonnull @Override
    public String toString() {
        final StringBuilder b = new StringBuilder();
        if (sourceOrdinal == 0 && destOrdinal == 0) {
            b.append("between(\"").append(sourceName).append("\", \"").append(destName).append("\")");
        } else {
            b.append("from(\"").append(sourceName).append('"');
            if (sourceOrdinal != 0) {
                b.append(", ").append(sourceOrdinal);
            }
            b.append(").to(\"").append(destName).append('"');
            if (destOrdinal != 0) {
                b.append(", ").append(destOrdinal);
            }
            b.append(')');
        }
        switch (getRoutingPolicy()) {
            case UNICAST:
                break;
            case ISOLATED:
                b.append(".isolated()");
                break;
            case PARTITIONED:
                b.append(getPartitioner() instanceof Single ? ".allToOne()" : ".partitioned(?)");
                break;
            case BROADCAST:
                b.append(".broadcast()");
                break;
            default:
        }
        if (isDistributed()) {
            b.append(".distributed()");
        }
        if (getPriority() != 0) {
            b.append(".priority(").append(getPriority()).append(')');
        }
        return b.toString();
    }

    @Override
    public boolean equals(Object obj) {
        final Edge that;
        return this == obj
                || obj instanceof Edge
                    && this.sourceName.equals((that = (Edge) obj).sourceName)
                    && this.destName.equals(that.destName);
    }

    @Override
    public int hashCode() {
        return 37 * sourceName.hashCode() + destName.hashCode();
    }

    void restoreSourceAndDest(Map<String, Vertex> nameToVertex) {
        source = nameToVertex.get(sourceName);
        destination = nameToVertex.get(destName);
        assert source != null : "Couldn't restore source vertex " + sourceName + " from map " + nameToVertex;
        assert destination != null : "Couldn't restore destination vertex " + destName + " from map " + nameToVertex;
    }

    // Implementation of IdentifiedDataSerializable

    @Override
    public void writeData(@Nonnull ObjectDataOutput out) throws IOException {
        out.writeUTF(getSourceName());
        out.writeInt(getSourceOrdinal());
        out.writeUTF(getDestName());
        out.writeInt(getDestOrdinal());
        out.writeInt(getPriority());
        out.writeBoolean(isDistributed());
        out.writeObject(getRoutingPolicy());
        CustomClassLoadedObject.write(out, getPartitioner());
        out.writeObject(getConfig());
    }

    @Override
    public void readData(@Nonnull ObjectDataInput in) throws IOException {
        sourceName = in.readUTF();
        sourceOrdinal = in.readInt();
        destName = in.readUTF();
        destOrdinal = in.readInt();
        priority = in.readInt();
        isDistributed = in.readBoolean();
        routingPolicy = in.readObject();
        partitioner = CustomClassLoadedObject.read(in);
        config = in.readObject();
    }

    @Override
    public int getFactoryId() {
        return SerializationConstants.FACTORY_ID;
    }

    @Override
    public int getId() {
        return SerializationConstants.EDGE;
    }

    // END Implementation of IdentifiedDataSerializable


    /**
     * An edge describes a connection from many upstream processors to many
     * downstream processors. The routing policy decides where exactly to route
     * each particular item emitted from an upstream processor. To simplify
     * the reasoning we introduce the concept of the set of candidate
     * downstream processors, or the <em>candidate set</em> for short. On
     * a local edge the candidate set contains only local processors and on a
     * distributed edge it contains all the processors.
     */
    public enum RoutingPolicy implements Serializable {
        /**
         * For each item a single destination processor is chosen from the
         * candidate set, with no restriction on the choice.
         */
        UNICAST,
        /**
         * Like {@link #UNICAST}, but guarantees that any given downstream
         * processor receives data from exactly one upstream processor. This is
         * needed in some DAG setups to apply selective backpressure to individual
         * upstream source processors.
         * <p>
         * The downstream's local parallelism must not be less than the upstream's.
         * This policy is only available on a local edge.
         */
        ISOLATED,
        /**
         * Each item is sent to the one processor responsible for the item's
         * partition ID. On a distributed edge the processor is unique across the
         * cluster; on a non-distributed edge the processor is unique only within a
         * member.
         */
        PARTITIONED,
        /**
         * Each item is sent to all candidate processors.
         */
        BROADCAST
    }

    private static class Single implements Partitioner<Object> {

        private static final long serialVersionUID = 1L;

        private final String key;
        private int partition;

        Single() {
            key = UuidUtil.newUnsecureUuidString();
        }

        @Override
        public void init(DefaultPartitionStrategy strat) {
            partition = strat.getPartition(key);
        }

        @Override
        public int getPartition(Object item, int partitionCount) {
            return partition;
        }
    }

    private static final class KeyPartitioner<T, K> implements Partitioner<T> {

        private static final long serialVersionUID = 1L;

        private final DistributedFunction<T, K> keyExtractor;
        private final Partitioner<? super K> partitioner;

        KeyPartitioner(@Nonnull DistributedFunction<T, K> keyExtractor, @Nonnull Partitioner<? super K> partitioner) {
            this.keyExtractor = keyExtractor;
            this.partitioner = partitioner;
        }

        @Override
        public void init(DefaultPartitionStrategy strat) {
            partitioner.init(strat);
        }

        @Override
        public int getPartition(T item, int partitionCount) {
            return partitioner.getPartition(keyExtractor.apply(item), partitionCount);
        }
    }
}
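
For orientation, here is a minimal usage sketch showing how the builder-style methods above are typically combined when assembling a DAG. It assumes the companion DAG and Vertex classes from the same package, plus the Processors.noopP() placeholder supplier from com.hazelcast.jet.core.processor in the same Jet release; the vertex names are made up for illustration, and the snippet is not part of the Edge source.

import com.hazelcast.jet.core.DAG;
import com.hazelcast.jet.core.Edge;
import com.hazelcast.jet.core.Vertex;
import com.hazelcast.jet.core.processor.Processors;

import static com.hazelcast.jet.function.DistributedFunctions.wholeItem;

public class EdgeUsageSketch {
    public static void main(String[] args) {
        // Assumes DAG.newVertex, DAG.edge and Processors.noopP() from the same
        // Jet release as this Edge class; vertex names are hypothetical.
        DAG dag = new DAG();

        // Placeholder vertices; a real job would supply meaningful processors.
        Vertex upstream = dag.newVertex("upstream", Processors.noopP());
        Vertex downstream = dag.newVertex("downstream", Processors.noopP());

        // Partition items by their whole value and make the edge distributed,
        // so every item with a given partition ID is routed to one unique
        // processor cluster-wide.
        dag.edge(Edge.between(upstream, downstream)
                     .partitioned(wholeItem())
                     .distributed());
    }
}

The same Edge instance could instead be configured with broadcast(), isolated(), allToOne(), or a non-default priority(int), as described in the Javadoc above; only one routing policy is in effect at a time.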