org.apache.flink.streaming.api.transformations.StreamTransformation Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of flink-streaming-java_2.11 Show documentation
There is a newer version: 1.14.6
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.streaming.api.transformations;

import org.apache.flink.annotation.Internal;
import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.common.functions.InvalidTypesException;
import org.apache.flink.api.common.operators.ResourceSpec;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.typeutils.MissingTypeInfo;
import org.apache.flink.streaming.api.graph.StreamGraph;
import org.apache.flink.streaming.api.graph.StreamGraphGenerator;
import org.apache.flink.streaming.api.operators.ChainingStrategy;
import org.apache.flink.util.Preconditions;

import java.util.Collection;

import static org.apache.flink.util.Preconditions.checkArgument;
import static org.apache.flink.util.Preconditions.checkNotNull;

/**
 * A {@code StreamTransformation} represents the operation that creates a
 * {@link org.apache.flink.streaming.api.datastream.DataStream}. Every
 * {@link org.apache.flink.streaming.api.datastream.DataStream} has an underlying
 * {@code StreamTransformation} that is the origin of said DataStream.
 *
 * API operations such as {@link org.apache.flink.streaming.api.datastream.DataStream#map} create
 * a tree of {@code StreamTransformation}s underneath. When the stream program is to be executed
 * this graph is translated to a {@link StreamGraph} using
 * {@link org.apache.flink.streaming.api.graph.StreamGraphGenerator}.
 *
 * 
A {@code StreamTransformation} does not necessarily correspond to a physical operation
 * at runtime. Some operations are only logical concepts. Examples of this are union,
 * split/select data stream, partitioning.
 *
 * 
The following graph of {@code StreamTransformations}:
 * 
{@code
 *   Source              Source
 *      +                   +
 *      |                   |
 *      v                   v
 *  Rebalance          HashPartition
 *      +                   +
 *      |                   |
 *      |                   |
 *      +------>Union<------+
 *                +
 *                |
 *                v
 *              Split
 *                +
 *                |
 *                v
 *              Select
 *                +
 *                v
 *               Map
 *                +
 *                |
 *                v
 *              Sink
 * }
 *
 * Would result in this graph of operations at runtime:
 * 
{@code
 *  Source              Source
 *    +                   +
 *    |                   |
 *    |                   |
 *    +------->Map<-------+
 *              +
 *              |
 *              v
 *             Sink
 * }
 *
 * The information about partitioning, union, split/select end up being encoded in the edges
 * that connect the sources to the map operation.
 *
 * @param  The type of the elements that result from this {@code StreamTransformation}
 */
@Internal
public abstract class StreamTransformation {

	// This is used to assign a unique ID to every StreamTransformation
	protected static Integer idCounter = 0;

	public static int getNewNodeId() {
		idCounter++;
		return idCounter;
	}


	protected final int id;

	protected String name;

	protected TypeInformation outputType;
	// This is used to handle MissingTypeInfo. As long as the outputType has not been queried
	// it can still be changed using setOutputType(). Afterwards an exception is thrown when
	// trying to change the output type.
	protected boolean typeUsed;

	private int parallelism;

	/**
	 * The maximum parallelism for this stream transformation. It defines the upper limit for
	 * dynamic scaling and the number of key groups used for partitioned state.
	 */
	private int maxParallelism = -1;

	/**
	 *  The minimum resources for this stream transformation. It defines the lower limit for
	 *  dynamic resources resize in future plan.
	 */
	private ResourceSpec minResources = ResourceSpec.DEFAULT;

	/**
	 *  The preferred resources for this stream transformation. It defines the upper limit for
	 *  dynamic resource resize in future plan.
	 */
	private ResourceSpec preferredResources = ResourceSpec.DEFAULT;

	/**
	 * User-specified ID for this transformation. This is used to assign the
	 * same operator ID across job restarts. There is also the automatically
	 * generated {@link #id}, which is assigned from a static counter. That
	 * field is independent from this.
	 */
	private String uid;

	private String userProvidedNodeHash;

	protected long bufferTimeout = -1;

	private String slotSharingGroup;

	/**
	 * Creates a new {@code StreamTransformation} with the given name, output type and parallelism.
	 *
	 * @param name The name of the {@code StreamTransformation}, this will be shown in Visualizations and the Log
	 * @param outputType The output type of this {@code StreamTransformation}
	 * @param parallelism The parallelism of this {@code StreamTransformation}
	 */
	public StreamTransformation(String name, TypeInformation outputType, int parallelism) {
		this.id = getNewNodeId();
		this.name = Preconditions.checkNotNull(name);
		this.outputType = outputType;
		this.parallelism = parallelism;
		this.slotSharingGroup = null;
	}

	/**
	 * Returns the unique ID of this {@code StreamTransformation}.
	 */
	public int getId() {
		return id;
	}

	/**
	 * Changes the name of this {@code StreamTransformation}.
	 */
	public void setName(String name) {
		this.name = name;
	}

	/**
	 * Returns the name of this {@code StreamTransformation}.
	 */
	public String getName() {
		return name;
	}

	/**
	 * Returns the parallelism of this {@code StreamTransformation}.
	 */
	public int getParallelism() {
		return parallelism;
	}

	/**
	 * Sets the parallelism of this {@code StreamTransformation}.
	 *
	 * @param parallelism The new parallelism to set on this {@code StreamTransformation}.
	 */
	public void setParallelism(int parallelism) {
		Preconditions.checkArgument(
				parallelism > 0 || parallelism == ExecutionConfig.PARALLELISM_DEFAULT,
				"The parallelism must be at least one, or ExecutionConfig.PARALLELISM_DEFAULT (use system default).");
		this.parallelism = parallelism;
	}

	/**
	 * Gets the maximum parallelism for this stream transformation.
	 *
	 * @return Maximum parallelism of this transformation.
	 */
	public int getMaxParallelism() {
		return maxParallelism;
	}

	/**
	 * Sets the maximum parallelism for this stream transformation.
	 *
	 * @param maxParallelism Maximum parallelism for this stream transformation.
	 */
	public void setMaxParallelism(int maxParallelism) {
		Preconditions.checkArgument(maxParallelism > 0
						&& maxParallelism <= StreamGraphGenerator.UPPER_BOUND_MAX_PARALLELISM,
				"Maximum parallelism must be between 1 and " + StreamGraphGenerator.UPPER_BOUND_MAX_PARALLELISM
						+ ". Found: " + maxParallelism);
		this.maxParallelism = maxParallelism;
	}

	/**
	 * Sets the minimum and preferred resources for this stream transformation.
	 *
	 * @param minResources The minimum resource of this transformation.
	 * @param preferredResources The preferred resource of this transformation.
	 */
	public void setResources(ResourceSpec minResources, ResourceSpec preferredResources) {
		this.minResources = checkNotNull(minResources);
		this.preferredResources = checkNotNull(preferredResources);
	}

	/**
	 * Gets the minimum resource of this stream transformation.
	 *
	 * @return The minimum resource of this transformation.
	 */
	public ResourceSpec getMinResources() {
		return minResources;
	}

	/**
	 * Gets the preferred resource of this stream transformation.
	 *
	 * @return The preferred resource of this transformation.
	 */
	public ResourceSpec getPreferredResources() {
		return preferredResources;
	}

	/**
	 * Sets an user provided hash for this operator. This will be used AS IS the create the
	 * JobVertexID.
	 *
	 * 
The user provided hash is an alternative to the generated hashes, that is considered when
	 * identifying an operator through the default hash mechanics fails (e.g. because of changes
	 * between Flink versions).
	 *
	 * 
Important: this should be used as a workaround or for trouble shooting.
	 * The provided hash needs to be unique per transformation and job. Otherwise, job submission
	 * will fail. Furthermore, you cannot assign user-specified hash to intermediate nodes in an
	 * operator chain and trying so will let your job fail.
	 *
	 * 
A use case for this is in migration between Flink versions or changing the jobs in a way
	 * that changes the automatically generated hashes. In this case, providing the previous hashes
	 * directly through this method (e.g. obtained from old logs) can help to reestablish a lost
	 * mapping from states to their target operator.
	 *
	 * @param uidHash The user provided hash for this operator. This will become the JobVertexID, which is shown in the
	 *                 logs and web ui.
	 */
	public void setUidHash(String uidHash) {

		Preconditions.checkNotNull(uidHash);
		Preconditions.checkArgument(uidHash.matches("^[0-9A-Fa-f]{32}$"),
				"Node hash must be a 32 character String that describes a hex code. Found: " + uidHash);

		this.userProvidedNodeHash = uidHash;
	}

	/**
	 * Gets the user provided hash.
	 *
	 * @return The user provided hash.
	 */
	public String getUserProvidedNodeHash() {
		return userProvidedNodeHash;
	}

	/**
	 * Sets an ID for this {@link StreamTransformation}. This is will later be hashed to a uidHash which is then used to
	 * create the JobVertexID (that is shown in logs and the web ui).
	 *
	 * 
The specified ID is used to assign the same operator ID across job
	 * submissions (for example when starting a job from a savepoint).
	 *
	 * 
Important: this ID needs to be unique per
	 * transformation and job. Otherwise, job submission will fail.
	 *
	 * @param uid The unique user-specified ID of this transformation.
	 */
	public void setUid(String uid) {
		this.uid = uid;
	}

	/**
	 * Returns the user-specified ID of this transformation.
	 *
	 * @return The unique user-specified ID of this transformation.
	 */
	public String getUid() {
		return uid;
	}

	/**
	 * Returns the slot sharing group of this transformation.
	 *
	 * @see #setSlotSharingGroup(String)
	 */
	public String getSlotSharingGroup() {
		return slotSharingGroup;
	}

	/**
	 * Sets the slot sharing group of this transformation. Parallel instances of operations that
	 * are in the same slot sharing group will be co-located in the same TaskManager slot, if
	 * possible.
	 *
	 * 
Initially, an operation is in the default slot sharing group. This can be explicitly
	 * set using {@code setSlotSharingGroup("default")}.
	 *
	 * @param slotSharingGroup The slot sharing group name.
	 */
	public void setSlotSharingGroup(String slotSharingGroup) {
		this.slotSharingGroup = slotSharingGroup;
	}

	/**
	 * Tries to fill in the type information. Type information can be filled in
	 * later when the program uses a type hint. This method checks whether the
	 * type information has ever been accessed before and does not allow
	 * modifications if the type was accessed already. This ensures consistency
	 * by making sure different parts of the operation do not assume different
	 * type information.
	 *
	 * @param outputType The type information to fill in.
	 *
	 * @throws IllegalStateException Thrown, if the type information has been accessed before.
	 */
	public void setOutputType(TypeInformation outputType) {
		if (typeUsed) {
			throw new IllegalStateException(
					"TypeInformation cannot be filled in for the type after it has been used. "
							+ "Please make sure that the type info hints are the first call after"
							+ " the transformation function, "
							+ "before any access to types or semantic properties, etc.");
		}
		this.outputType = outputType;
	}

	/**
	 * Returns the output type of this {@code StreamTransformation} as a {@link TypeInformation}. Once
	 * this is used once the output type cannot be changed anymore using {@link #setOutputType}.
	 *
	 * @return The output type of this {@code StreamTransformation}
	 */
	public TypeInformation getOutputType() {
		if (outputType instanceof MissingTypeInfo) {
			MissingTypeInfo typeInfo = (MissingTypeInfo) this.outputType;
			throw new InvalidTypesException(
					"The return type of function '"
							+ typeInfo.getFunctionName()
							+ "' could not be determined automatically, due to type erasure. "
							+ "You can give type information hints by using the returns(...) "
							+ "method on the result of the transformation call, or by letting "
							+ "your function implement the 'ResultTypeQueryable' "
							+ "interface.", typeInfo.getTypeException());
		}
		typeUsed = true;
		return this.outputType;
	}

	/**
	 * Sets the chaining strategy of this {@code StreamTransformation}.
	 */
	public abstract void setChainingStrategy(ChainingStrategy strategy);

	/**
	 * Set the buffer timeout of this {@code StreamTransformation}. The timeout defines how long data
	 * may linger in a partially full buffer before being sent over the network.
	 *
	 * 
Lower timeouts lead to lower tail latencies, but may affect throughput.
	 * For Flink 1.5+, timeouts of 1ms are feasible for jobs with high parallelism.
	 *
	 * A value of -1 means that the default buffer timeout should be used. A value
	 * of zero indicates that no buffering should happen, and all records/events should be
	 * immediately sent through the network, without additional buffering.
	 */
	public void setBufferTimeout(long bufferTimeout) {
		checkArgument(bufferTimeout >= -1);
		this.bufferTimeout = bufferTimeout;
	}

	/**
	 * Returns the buffer timeout of this {@code StreamTransformation}.
	 *
	 * @see #setBufferTimeout(long)
	 */
	public long getBufferTimeout() {
		return bufferTimeout;
	}

	/**
	 * Returns all transitive predecessor {@code StreamTransformation}s of this {@code StreamTransformation}. This
	 * is, for example, used when determining whether a feedback edge of an iteration
	 * actually has the iteration head as a predecessor.
	 *
	 * @return The list of transitive predecessors.
	 */
	public abstract Collection> getTransitivePredecessors();

	@Override
	public String toString() {
		return getClass().getSimpleName() + "{" +
				"id=" + id +
				", name='" + name + '\'' +
				", outputType=" + outputType +
				", parallelism=" + parallelism +
				'}';
	}

	@Override
	public boolean equals(Object o) {
		if (this == o) {
			return true;
		}
		if (!(o instanceof StreamTransformation)) {
			return false;
		}

		StreamTransformation that = (StreamTransformation) o;

		if (bufferTimeout != that.bufferTimeout) {
			return false;
		}
		if (id != that.id) {
			return false;
		}
		if (parallelism != that.parallelism) {
			return false;
		}
		if (!name.equals(that.name)) {
			return false;
		}
		return outputType != null ? outputType.equals(that.outputType) : that.outputType == null;
	}

	@Override
	public int hashCode() {
		int result = id;
		result = 31 * result + name.hashCode();
		result = 31 * result + (outputType != null ? outputType.hashCode() : 0);
		result = 31 * result + parallelism;
		result = 31 * result + (int) (bufferTimeout ^ (bufferTimeout >>> 32));
		return result;
	}
}