// Source listing: com.google.cloud.dataflow.sdk.io.PubsubIO
// (from the google-cloud-dataflow-java-sdk-all artifact).
/*
* Copyright (C) 2015 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package com.google.cloud.dataflow.sdk.io;
import static com.google.common.base.MoreObjects.firstNonNull;
import static com.google.common.base.Preconditions.checkArgument;
import com.google.api.client.util.Clock;
import com.google.api.client.util.DateTime;
import com.google.api.services.pubsub.Pubsub;
import com.google.api.services.pubsub.model.AcknowledgeRequest;
import com.google.api.services.pubsub.model.PublishRequest;
import com.google.api.services.pubsub.model.PubsubMessage;
import com.google.api.services.pubsub.model.PullRequest;
import com.google.api.services.pubsub.model.PullResponse;
import com.google.api.services.pubsub.model.ReceivedMessage;
import com.google.api.services.pubsub.model.Subscription;
import com.google.cloud.dataflow.sdk.coders.Coder;
import com.google.cloud.dataflow.sdk.coders.StringUtf8Coder;
import com.google.cloud.dataflow.sdk.coders.VoidCoder;
import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
import com.google.cloud.dataflow.sdk.runners.DataflowPipelineRunner;
import com.google.cloud.dataflow.sdk.runners.DirectPipelineRunner;
import com.google.cloud.dataflow.sdk.runners.PipelineRunner;
import com.google.cloud.dataflow.sdk.transforms.Create;
import com.google.cloud.dataflow.sdk.transforms.DoFn;
import com.google.cloud.dataflow.sdk.transforms.PTransform;
import com.google.cloud.dataflow.sdk.transforms.ParDo;
import com.google.cloud.dataflow.sdk.transforms.windowing.AfterWatermark;
import com.google.cloud.dataflow.sdk.util.CoderUtils;
import com.google.cloud.dataflow.sdk.util.Transport;
import com.google.cloud.dataflow.sdk.util.WindowingStrategy;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.cloud.dataflow.sdk.values.PCollection.IsBounded;
import com.google.cloud.dataflow.sdk.values.PDone;
import com.google.cloud.dataflow.sdk.values.PInput;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableMap;
import org.joda.time.Duration;
import org.joda.time.Instant;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.annotation.Nullable;
/**
 * Read and Write {@link PTransform}s for Cloud Pub/Sub streams. These transforms create
 * and consume unbounded {@link PCollection PCollections}.
 *
 * <h3>Permissions</h3>
 * <p>Permission requirements depend on the {@link PipelineRunner} that is used to execute the
 * Dataflow job. Please refer to the documentation of the corresponding
 * {@link PipelineRunner PipelineRunners} for more details.
 */
public class PubsubIO {
private static final Logger LOG = LoggerFactory.getLogger(PubsubIO.class);

/** The default {@link Coder} used to translate to/from Cloud Pub/Sub messages. */
public static final Coder<String> DEFAULT_PUBSUB_CODER = StringUtf8Coder.of();

/**
 * Loose pattern for project IDs: 6-63 characters, starting with a lowercase letter and ending
 * with a lowercase letter or digit. Besides lowercase letters, digits and dashes, the middle
 * characters may also be ':' or '.'.
 *
 * <p>This regex isn't exact - it allows patterns that would be rejected by the service, but it
 * is sufficient for basic parsing of topic and subscription paths.
 */
private static final Pattern PROJECT_ID_REGEXP =
    Pattern.compile("[a-z][-a-z0-9:.]{4,61}[a-z0-9]");

/** Current subscription path format; group 1 is the project, group 2 the subscription name. */
private static final Pattern SUBSCRIPTION_REGEXP =
    Pattern.compile("projects/([^/]+)/subscriptions/(.+)");

/** Current topic path format; group 1 is the project, group 2 the topic name. */
private static final Pattern TOPIC_REGEXP = Pattern.compile("projects/([^/]+)/topics/(.+)");

/** Legacy v1beta1 subscription path format; group 1 is the project, group 2 the name. */
private static final Pattern V1BETA1_SUBSCRIPTION_REGEXP =
    Pattern.compile("/subscriptions/([^/]+)/(.+)");

/** Legacy v1beta1 topic path format; group 1 is the project, group 2 the name. */
private static final Pattern V1BETA1_TOPIC_REGEXP = Pattern.compile("/topics/([^/]+)/(.+)");

/**
 * Characters permitted in a topic or subscription name: a leading ASCII letter followed by at
 * least one letter, digit, or one of {@code - . _ ~ % +}.
 */
private static final Pattern PUBSUB_NAME_REGEXP = Pattern.compile("[a-zA-Z][-._~%+a-zA-Z0-9]+");

/** Longest topic or subscription name accepted by {@code validatePubsubName}. */
private static final int PUBSUB_NAME_MAX_LENGTH = 255;

/** Subscription paths with this prefix are treated as fake (unvalidated) subscriptions. */
private static final String SUBSCRIPTION_RANDOM_TEST_PREFIX = "_random/";

/** Subscription paths with this prefix are treated as fake (unvalidated) subscriptions. */
private static final String SUBSCRIPTION_STARTING_SIGNAL = "_starting_signal/";

/** The one topic path that is treated as a fake (unvalidated) topic. */
private static final String TOPIC_DEV_NULL_TEST_NAME = "/topics/dev/null";
/**
 * Verifies that {@code project} matches {@code PROJECT_ID_REGEXP} in its entirety.
 *
 * <p>The error message mentions a subscription, but this check is also applied to the project
 * component of topic paths.
 *
 * @throws IllegalArgumentException if the project name does not match
 */
private static void validateProjectName(String project) {
  boolean wellFormed = PROJECT_ID_REGEXP.matcher(project).matches();
  if (wellFormed) {
    return;
  }
  throw new IllegalArgumentException(
      "Illegal project name specified in Pubsub subscription: " + project);
}
/**
 * Checks a topic or subscription name against the locally enforced naming rules, in this
 * order: maximum length, reserved {@code goog} prefix, then {@code PUBSUB_NAME_REGEXP}.
 *
 * @throws IllegalArgumentException on the first rule that the name violates
 */
private static void validatePubsubName(String name) {
  final boolean tooLong = name.length() > PUBSUB_NAME_MAX_LENGTH;
  if (tooLong) {
    throw new IllegalArgumentException(
        "Pubsub object name is longer than 255 characters: " + name);
  }

  final boolean usesReservedPrefix = name.startsWith("goog");
  if (usesReservedPrefix) {
    throw new IllegalArgumentException("Pubsub object name cannot start with goog: " + name);
  }

  if (PUBSUB_NAME_REGEXP.matcher(name).matches()) {
    return;
  }
  throw new IllegalArgumentException("Illegal Pubsub object name specified: " + name
      + " Please see Javadoc for naming rules.");
}
/**
 * Returns the {@link Instant} that corresponds to the timestamp in the supplied
 * {@link PubsubMessage} under the specified {@code label}. See
 * {@link PubsubIO.Read#timestampLabel(String)} for details about how these messages are
 * parsed.
 *
 * <p>If {@code label} is {@code null}, the current time reported by {@code clock} is returned.
 * The {@link Clock} parameter is used to virtualize time for testing.
 *
 * @param message the message whose attributes carry the timestamp
 * @param label the attribute name holding the timestamp, or {@code null} to use {@code clock}
 * @param clock the time source used when no label is given
 * @return the timestamp to assign to the message
 * @throws IllegalArgumentException if the timestamp label is provided, but there is no
 *     corresponding attribute in the message or the value provided is not a valid timestamp
 *     string.
 * @see PubsubIO.Read#timestampLabel(String)
 */
@VisibleForTesting
protected static Instant assignMessageTimestamp(
    PubsubMessage message, @Nullable String label, Clock clock) {
  if (label == null) {
    return new Instant(clock.currentTimeMillis());
  }

  // Extract message attributes, defaulting to empty map if null.
  Map<String, String> attributes = firstNonNull(
      message.getAttributes(), ImmutableMap.<String, String>of());

  String timestampStr = attributes.get(label);
  checkArgument(timestampStr != null && !timestampStr.isEmpty(),
      "PubSub message is missing a timestamp in label: %s", label);

  long millisSinceEpoch;
  try {
    // First interpretation: a plain count of milliseconds since the epoch.
    millisSinceEpoch = Long.parseLong(timestampStr);
  } catch (NumberFormatException ignored) {
    // Not a plain number, so fall back to RFC 3339. DateTime.parseRfc3339 throws an
    // IllegalArgumentException if that fails too, and the caller should handle it.
    millisSinceEpoch = DateTime.parseRfc3339(timestampStr).getValue();
  }
  return new Instant(millisSinceEpoch);
}
/**
 * Class representing a Cloud Pub/Sub Subscription.
 */
public static class PubsubSubscription implements Serializable {
  /** {@code FAKE} marks the special test paths that skip parsing and validation. */
  private enum Type { NORMAL, FAKE }

  private final Type type;
  private final String project;
  private final String subscription;

  private PubsubSubscription(Type type, String project, String subscription) {
    this.type = type;
    this.project = project;
    this.subscription = subscription;
  }

  /**
   * Creates a class representing a Pub/Sub subscription from the specified subscription path.
   *
   * <p>Cloud Pub/Sub subscription names should be of the form
   * {@code projects/<project>/subscriptions/<subscription>}, where {@code <project>} is the name
   * of the project the subscription belongs to. The {@code <subscription>} component must comply
   * with the following requirements:
   *
   * <ul>
   * <li>Can only contain lowercase letters, numbers, dashes ('-'), underscores ('_') and periods
   * ('.').</li>
   * <li>Must be between 3 and 255 characters.</li>
   * <li>Must begin with a letter.</li>
   * <li>Must end with a letter or a number.</li>
   * <li>Cannot begin with {@code 'goog'} prefix.</li>
   * </ul>
   *
   * <p>The checks performed here are looser than this list (for example, upper-case letters
   * are accepted). The legacy v1beta1 form {@code /subscriptions/<project>/<subscription>} is
   * still accepted but logs a warning.
   *
   * @param path the subscription path to parse
   * @return the parsed subscription
   * @throws IllegalArgumentException if the path matches neither format, or if the project or
   *     subscription component fails validation
   */
  public static PubsubSubscription fromPath(String path) {
    if (path.startsWith(SUBSCRIPTION_RANDOM_TEST_PREFIX)
        || path.startsWith(SUBSCRIPTION_STARTING_SIGNAL)) {
      // Test-only paths are kept verbatim and never validated.
      return new PubsubSubscription(Type.FAKE, "", path);
    }

    String projectName;
    String subscriptionName;

    Matcher v1beta1Match = V1BETA1_SUBSCRIPTION_REGEXP.matcher(path);
    if (v1beta1Match.matches()) {
      LOG.warn("Saw subscription in v1beta1 format. Subscriptions should be in the format "
          + "projects/<project_id>/subscriptions/<subscription_name>");
      projectName = v1beta1Match.group(1);
      subscriptionName = v1beta1Match.group(2);
    } else {
      Matcher match = SUBSCRIPTION_REGEXP.matcher(path);
      if (!match.matches()) {
        throw new IllegalArgumentException("Pubsub subscription is not in "
            + "projects/<project_id>/subscriptions/<subscription_name> format: " + path);
      }
      projectName = match.group(1);
      subscriptionName = match.group(2);
    }

    validateProjectName(projectName);
    validatePubsubName(subscriptionName);
    return new PubsubSubscription(Type.NORMAL, projectName, subscriptionName);
  }

  /**
   * Returns the string representation of this subscription as a path used in the Cloud Pub/Sub
   * v1beta1 API. Fake subscriptions are returned verbatim.
   *
   * @deprecated the v1beta1 API for Cloud Pub/Sub is deprecated.
   */
  @Deprecated
  public String asV1Beta1Path() {
    if (type == Type.NORMAL) {
      return "/subscriptions/" + project + "/" + subscription;
    } else {
      return subscription;
    }
  }

  /**
   * Returns the string representation of this subscription as a path used in the Cloud Pub/Sub
   * v1beta2 API. This is the same format that {@link #asPath()} produces.
   *
   * @deprecated the v1beta2 API for Cloud Pub/Sub is deprecated.
   */
  @Deprecated
  public String asV1Beta2Path() {
    return asPath();
  }

  /**
   * Returns the string representation of this subscription as a path used in the Cloud Pub/Sub
   * API. Fake subscriptions are returned verbatim.
   */
  public String asPath() {
    if (type == Type.NORMAL) {
      return "projects/" + project + "/subscriptions/" + subscription;
    } else {
      return subscription;
    }
  }
}
/**
 * Class representing a Cloud Pub/Sub Topic.
 */
public static class PubsubTopic implements Serializable {
  /** {@code FAKE} marks the special test path that skips parsing and validation. */
  private enum Type { NORMAL, FAKE }

  private final Type type;
  private final String project;
  private final String topic;

  private PubsubTopic(Type type, String project, String topic) {
    this.type = type;
    this.project = project;
    this.topic = topic;
  }

  /**
   * Creates a class representing a Cloud Pub/Sub topic from the specified topic path.
   *
   * <p>Cloud Pub/Sub topic names should be of the form
   * {@code projects/<project>/topics/<topic>}, where {@code <project>} is the name of
   * the publishing project. The {@code <topic>} component must comply with
   * the following requirements:
   *
   * <ul>
   * <li>Can only contain lowercase letters, numbers, dashes ('-'), underscores ('_') and periods
   * ('.').</li>
   * <li>Must be between 3 and 255 characters.</li>
   * <li>Must begin with a letter.</li>
   * <li>Must end with a letter or a number.</li>
   * <li>Cannot begin with 'goog' prefix.</li>
   * </ul>
   *
   * <p>The checks performed here are looser than this list (for example, upper-case letters
   * are accepted). The legacy v1beta1 form {@code /topics/<project>/<topic>} is still accepted
   * but logs a warning.
   *
   * @param path the topic path to parse
   * @return the parsed topic
   * @throws IllegalArgumentException if the path matches neither format, or if the project or
   *     topic component fails validation
   */
  public static PubsubTopic fromPath(String path) {
    if (path.equals(TOPIC_DEV_NULL_TEST_NAME)) {
      // Checked first: this test-only path is kept verbatim and never validated.
      return new PubsubTopic(Type.FAKE, "", path);
    }

    String projectName;
    String topicName;

    Matcher v1beta1Match = V1BETA1_TOPIC_REGEXP.matcher(path);
    if (v1beta1Match.matches()) {
      LOG.warn("Saw topic in v1beta1 format. Topics should be in the format "
          + "projects/<project_id>/topics/<topic_name>");
      projectName = v1beta1Match.group(1);
      topicName = v1beta1Match.group(2);
    } else {
      Matcher match = TOPIC_REGEXP.matcher(path);
      if (!match.matches()) {
        throw new IllegalArgumentException(
            "Pubsub topic is not in projects/<project_id>/topics/<topic_name> format: " + path);
      }
      projectName = match.group(1);
      topicName = match.group(2);
    }

    validateProjectName(projectName);
    validatePubsubName(topicName);
    return new PubsubTopic(Type.NORMAL, projectName, topicName);
  }

  /**
   * Returns the string representation of this topic as a path used in the Cloud Pub/Sub
   * v1beta1 API. The fake topic is returned verbatim.
   *
   * @deprecated the v1beta1 API for Cloud Pub/Sub is deprecated.
   */
  @Deprecated
  public String asV1Beta1Path() {
    if (type == Type.NORMAL) {
      return "/topics/" + project + "/" + topic;
    } else {
      return topic;
    }
  }

  /**
   * Returns the string representation of this topic as a path used in the Cloud Pub/Sub
   * v1beta2 API. This is the same format that {@link #asPath()} produces.
   *
   * @deprecated the v1beta2 API for Cloud Pub/Sub is deprecated.
   */
  @Deprecated
  public String asV1Beta2Path() {
    return asPath();
  }

  /**
   * Returns the string representation of this topic as a path used in the Cloud Pub/Sub
   * API. The fake topic is returned verbatim.
   */
  public String asPath() {
    if (type == Type.NORMAL) {
      return "projects/" + project + "/topics/" + topic;
    } else {
      return topic;
    }
  }
}
/**
 * A {@link PTransform} that continuously reads from a Cloud Pub/Sub stream and
 * returns a {@link PCollection} of {@link String Strings} containing the items from
 * the stream.
 *
 * <p>When running with a {@link PipelineRunner} that only supports bounded
 * {@link PCollection PCollections} (such as {@link DirectPipelineRunner} or
 * {@link DataflowPipelineRunner} without {@code --streaming}), only a bounded portion of the
 * input Pub/Sub stream can be processed. As such, either {@link Bound#maxNumRecords(int)} or
 * {@link Bound#maxReadTime(Duration)} must be set.
 */
public static class Read {
/**
 * Creates and returns a transform for reading from Cloud Pub/Sub with the specified transform
 * name. The transform decodes messages with {@code DEFAULT_PUBSUB_CODER}.
 */
public static Bound<String> named(String name) {
  return new Bound<>(DEFAULT_PUBSUB_CODER).named(name);
}
/**
 * Creates and returns a transform for reading from a Cloud Pub/Sub topic. Mutually exclusive
 * with {@link #subscription(String)}.
 *
 * <p>See {@link PubsubIO.PubsubTopic#fromPath(String)} for more details on the format
 * of the {@code topic} string.
 *
 * <p>Dataflow will start reading data published on this topic from the time the pipeline is
 * started. Any data published on the topic before the pipeline is started will not be read by
 * Dataflow.
 *
 * @throws IllegalArgumentException if {@code topic} is not a valid topic path
 */
public static Bound<String> topic(String topic) {
  return new Bound<>(DEFAULT_PUBSUB_CODER).topic(topic);
}
/**
 * Creates and returns a transform for reading from a specific Cloud Pub/Sub subscription.
 * Mutually exclusive with {@link #topic(String)}.
 *
 * <p>See {@link PubsubIO.PubsubSubscription#fromPath(String)} for more details on the format
 * of the {@code subscription} string.
 *
 * @throws IllegalArgumentException if {@code subscription} is not a valid subscription path
 */
public static Bound<String> subscription(String subscription) {
  return new Bound<>(DEFAULT_PUBSUB_CODER).subscription(subscription);
}
/**
 * Creates and returns a transform reading from Cloud Pub/Sub where record timestamps are
 * expected to be provided as Pub/Sub message attributes. The {@code timestampLabel}
 * parameter specifies the name of the attribute that contains the timestamp.
 *
 * <p>The timestamp value is expected to be represented in the attribute as either:
 *
 * <ul>
 * <li>a numerical value representing the number of milliseconds since the Unix epoch. For
 * example, if using the Joda time classes, {@link Instant#getMillis()} returns the correct
 * value for this attribute.</li>
 * <li>a String in RFC 3339 format. For example, {@code 2015-10-29T23:41:41.123Z}. The
 * sub-second component of the timestamp is optional, and digits beyond the first three
 * (i.e., time units smaller than milliseconds) will be ignored.</li>
 * </ul>
 *
 * <p>If {@code timestampLabel} is not provided, the system will generate record timestamps
 * the first time it sees each record. All windowing will be done relative to these timestamps.
 *
 * <p>By default, windows are emitted based on an estimate of when this source is likely
 * done producing data for a given timestamp (referred to as the Watermark; see
 * {@link AfterWatermark} for more details). Any late data will be handled by the trigger
 * specified with the windowing strategy – by default it will be output immediately.
 *
 * <p>Note that the system can guarantee that no late data will ever be seen when it assigns
 * timestamps by arrival time (i.e. {@code timestampLabel} is not provided).
 *
 * @see <a href="https://www.ietf.org/rfc/rfc3339.txt">RFC 3339</a>
 */
public static Bound<String> timestampLabel(String timestampLabel) {
  return new Bound<>(DEFAULT_PUBSUB_CODER).timestampLabel(timestampLabel);
}
/**
 * Creates and returns a transform for reading from Cloud Pub/Sub where unique record
 * identifiers are expected to be provided as Pub/Sub message attributes. The {@code idLabel}
 * parameter specifies the attribute name. The value of the attribute can be any string
 * that uniquely identifies this record.
 *
 * <p>If {@code idLabel} is not provided, Dataflow cannot guarantee that no duplicate data will
 * be delivered on the Pub/Sub stream. In this case, deduplication of the stream will be
 * strictly best effort.
 */
public static Bound<String> idLabel(String idLabel) {
  return new Bound<>(DEFAULT_PUBSUB_CODER).idLabel(idLabel);
}
/**
 * Creates and returns a transform for reading from Cloud Pub/Sub that uses the given
 * {@link Coder} to decode Pub/Sub messages into a value of type {@code T}.
 *
 * <p>By default, uses {@link StringUtf8Coder}, which just
 * returns the text lines as Java strings.
 *
 * @param <T> the type of the decoded elements, and the elements
 *     of the resulting PCollection.
 */
public static <T> Bound<T> withCoder(Coder<T> coder) {
  return new Bound<>(coder);
}
/**
 * Creates and returns a transform for reading from Cloud Pub/Sub with a maximum number of
 * records that will be read. The transform produces a bounded {@link PCollection}.
 *
 * <p>Either this option or {@link #maxReadTime(Duration)} must be set in order to create a
 * bounded source. A value of {@code 0}, which is the default, places no limit on the number
 * of records.
 */
public static Bound<String> maxNumRecords(int maxNumRecords) {
  return new Bound<>(DEFAULT_PUBSUB_CODER).maxNumRecords(maxNumRecords);
}
/**
 * Creates and returns a transform for reading from Cloud Pub/Sub with a maximum
 * duration during which records will be read. The transform produces a bounded
 * {@link PCollection}.
 *
 * <p>Either this option or {@link #maxNumRecords(int)} must be set in order to create a bounded
 * source.
 */
public static Bound<String> maxReadTime(Duration maxReadTime) {
  return new Bound<>(DEFAULT_PUBSUB_CODER).maxReadTime(maxReadTime);
}
/**
* A {@link PTransform} that reads from a Cloud Pub/Sub source and returns
* a unbounded {@link PCollection} containing the items from the stream.
*/
public static class Bound extends PTransform> {
/** The Cloud Pub/Sub topic to read from. */
@Nullable private final PubsubTopic topic;
/** The Cloud Pub/Sub subscription to read from. */
@Nullable private final PubsubSubscription subscription;
/** The name of the message attribute to read timestamps from. */
@Nullable private final String timestampLabel;
/** The name of the message attribute to read unique message IDs from. */
@Nullable private final String idLabel;
/** The coder used to decode each record. */
@Nullable private final Coder coder;
/** Stop after reading this many records. */
private final int maxNumRecords;
/** Stop after reading for this much time. */
@Nullable private final Duration maxReadTime;
private Bound(Coder coder) {
this(null, null, null, null, coder, null, 0, null);
}
private Bound(String name, PubsubSubscription subscription, PubsubTopic topic,
String timestampLabel, Coder coder, String idLabel, int maxNumRecords,
Duration maxReadTime) {
super(name);
this.subscription = subscription;
this.topic = topic;
this.timestampLabel = timestampLabel;
this.coder = coder;
this.idLabel = idLabel;
this.maxNumRecords = maxNumRecords;
this.maxReadTime = maxReadTime;
}
/**
* Returns a transform that's like this one but with the given step name.
*
* Does not modify this object.
*/
public Bound named(String name) {
return new Bound<>(
name, subscription, topic, timestampLabel, coder, idLabel, maxNumRecords, maxReadTime);
}
/**
* Returns a transform that's like this one but reading from the
* given subscription.
*
* See {@link PubsubIO.PubsubSubscription#fromPath(String)} for more details on the format
* of the {@code subscription} string.
*
*
Multiple readers reading from the same subscription will each receive
* some arbitrary portion of the data. Most likely, separate readers should
* use their own subscriptions.
*
*
Does not modify this object.
*/
public Bound subscription(String subscription) {
return new Bound<>(name, PubsubSubscription.fromPath(subscription), topic, timestampLabel,
coder, idLabel, maxNumRecords, maxReadTime);
}
/**
* Returns a transform that's like this one but that reads from the specified topic.
*
* See {@link PubsubIO.PubsubTopic#fromPath(String)} for more details on the
* format of the {@code topic} string.
*
*
Does not modify this object.
*/
public Bound topic(String topic) {
return new Bound<>(name, subscription, PubsubTopic.fromPath(topic), timestampLabel, coder,
idLabel, maxNumRecords, maxReadTime);
}
/**
* Returns a transform that's like this one but that reads message timestamps
* from the given message attribute. See {@link PubsubIO.Read#timestampLabel(String)} for
* more details on the format of the timestamp attribute.
*
* Does not modify this object.
*/
public Bound timestampLabel(String timestampLabel) {
return new Bound<>(
name, subscription, topic, timestampLabel, coder, idLabel, maxNumRecords, maxReadTime);
}
/**
* Returns a transform that's like this one but that reads unique message IDs
* from the given message attribute. See {@link PubsubIO.Read#idLabel(String)} for more
* details on the format of the ID attribute.
*
* Does not modify this object.
*/
public Bound idLabel(String idLabel) {
return new Bound<>(
name, subscription, topic, timestampLabel, coder, idLabel, maxNumRecords, maxReadTime);
}
/**
* Returns a transform that's like this one but that uses the given
* {@link Coder} to decode each record into a value of type {@code X}.
*
* Does not modify this object.
*
* @param the type of the decoded elements, and the
* elements of the resulting PCollection.
*/
public Bound withCoder(Coder coder) {
return new Bound<>(
name, subscription, topic, timestampLabel, coder, idLabel, maxNumRecords, maxReadTime);
}
/**
* Returns a transform that's like this one but will only read up to the specified
* maximum number of records from Cloud Pub/Sub. The transform produces a bounded
* {@link PCollection}. See {@link PubsubIO.Read#maxNumRecords(int)} for more details.
*/
public Bound maxNumRecords(int maxNumRecords) {
return new Bound<>(
name, subscription, topic, timestampLabel, coder, idLabel, maxNumRecords, maxReadTime);
}
/**
* Returns a transform that's like this one but will only read during the specified
* duration from Cloud Pub/Sub. The transform produces a bounded {@link PCollection}.
* See {@link PubsubIO.Read#maxReadTime(Duration)} for more details.
*/
public Bound maxReadTime(Duration maxReadTime) {
return new Bound<>(
name, subscription, topic, timestampLabel, coder, idLabel, maxNumRecords, maxReadTime);
}
@Override
public PCollection apply(PInput input) {
if (topic == null && subscription == null) {
throw new IllegalStateException("need to set either the topic or the subscription for "
+ "a PubsubIO.Read transform");
}
if (topic != null && subscription != null) {
throw new IllegalStateException("Can't set both the topic and the subscription for a "
+ "PubsubIO.Read transform");
}
boolean boundedOutput = getMaxNumRecords() > 0 || getMaxReadTime() != null;
if (boundedOutput) {
return input.getPipeline().begin()
.apply(Create.of((Void) null)).setCoder(VoidCoder.of())
.apply(ParDo.of(new PubsubReader())).setCoder(coder);
} else {
return PCollection.createPrimitiveOutputInternal(
input.getPipeline(), WindowingStrategy.globalDefault(), IsBounded.UNBOUNDED)
.setCoder(coder);
}
}
@Override
protected Coder getDefaultOutputCoder() {
return coder;
}
public PubsubTopic getTopic() {
return topic;
}
public PubsubSubscription getSubscription() {
return subscription;
}
public String getTimestampLabel() {
return timestampLabel;
}
public Coder getCoder() {
return coder;
}
public String getIdLabel() {
return idLabel;
}
public int getMaxNumRecords() {
return maxNumRecords;
}
public Duration getMaxReadTime() {
return maxReadTime;
}
private class PubsubReader extends DoFn {
private static final int DEFAULT_PULL_SIZE = 100;
@Override
public void processElement(ProcessContext c) throws IOException {
Pubsub pubsubClient =
Transport.newPubsubClient(c.getPipelineOptions().as(DataflowPipelineOptions.class))
.build();
String subscription;
if (getSubscription() == null) {
String topic = getTopic().asPath();
String[] split = topic.split("/");
subscription =
"projects/" + split[1] + "/subscriptions/" + split[3] + "_dataflow_"
+ new Random().nextLong();
Subscription subInfo = new Subscription().setAckDeadlineSeconds(60).setTopic(topic);
try {
pubsubClient.projects().subscriptions().create(subscription, subInfo).execute();
} catch (Exception e) {
throw new RuntimeException("Failed to create subscription: ", e);
}
} else {
subscription = getSubscription().asPath();
}
Instant endTime = (getMaxReadTime() == null)
? new Instant(Long.MAX_VALUE) : Instant.now().plus(getMaxReadTime());
List messages = new ArrayList<>();
Throwable finallyBlockException = null;
try {
while ((getMaxNumRecords() == 0 || messages.size() < getMaxNumRecords())
&& Instant.now().isBefore(endTime)) {
PullRequest pullRequest = new PullRequest().setReturnImmediately(false);
if (getMaxNumRecords() > 0) {
pullRequest.setMaxMessages(getMaxNumRecords() - messages.size());
} else {
pullRequest.setMaxMessages(DEFAULT_PULL_SIZE);
}
PullResponse pullResponse =
pubsubClient.projects().subscriptions().pull(subscription, pullRequest).execute();
List ackIds = new ArrayList<>();
if (pullResponse.getReceivedMessages() != null) {
for (ReceivedMessage received : pullResponse.getReceivedMessages()) {
messages.add(received.getMessage());
ackIds.add(received.getAckId());
}
}
if (ackIds.size() != 0) {
AcknowledgeRequest ackRequest = new AcknowledgeRequest().setAckIds(ackIds);
pubsubClient.projects()
.subscriptions()
.acknowledge(subscription, ackRequest)
.execute();
}
}
} catch (IOException e) {
throw new RuntimeException("Unexpected exception while reading from Pubsub: ", e);
} finally {
if (getTopic() != null) {
try {
pubsubClient.projects().subscriptions().delete(subscription).execute();
} catch (IOException e) {
finallyBlockException = new RuntimeException("Failed to delete subscription: ", e);
LOG.error("Failed to delete subscription: ", e);
}
}
}
if (finallyBlockException != null) {
Throwables.propagate(finallyBlockException);
}
for (PubsubMessage message : messages) {
c.outputWithTimestamp(
CoderUtils.decodeFromByteArray(getCoder(), message.decodeData()),
assignMessageTimestamp(message, getTimestampLabel(), Clock.SYSTEM));
}
}
}
}
/**
 * Disallow construction of utility class: {@code Read} only hosts static factory methods and
 * the nested {@code Bound} transform.
 */
private Read() {}
}
/////////////////////////////////////////////////////////////////////////////
/**
 * Disallow construction of utility class: {@code PubsubIO} is only a namespace for the nested
 * {@code Read} and {@code Write} transforms and the path helper classes.
 */
private PubsubIO() {}
/**
* A {@link PTransform} that continuously writes a
* {@link PCollection} of {@link String Strings} to a Cloud Pub/Sub stream.
*/
// TODO: Support non-String encodings.
public static class Write {
/**
 * Creates a transform that writes to Pub/Sub with the given step name. The transform encodes
 * elements with {@code DEFAULT_PUBSUB_CODER}.
 */
public static Bound<String> named(String name) {
  return new Bound<>(DEFAULT_PUBSUB_CODER).named(name);
}
/**
 * Creates a transform that publishes to the specified topic.
 *
 * <p>See {@link PubsubIO.PubsubTopic#fromPath(String)} for more details on the format of the
 * {@code topic} string.
 *
 * @throws IllegalArgumentException if {@code topic} is not a valid topic path
 */
public static Bound<String> topic(String topic) {
  return new Bound<>(DEFAULT_PUBSUB_CODER).topic(topic);
}
/**
 * Creates a transform that writes to Pub/Sub and adds each record's timestamp to the published
 * messages in an attribute with the specified name. The value of the attribute will be a number
 * representing the number of milliseconds since the Unix epoch. For example, if using the Joda
 * time classes, {@link Instant#Instant(long)} can be used to parse this value.
 *
 * <p>If the output from this sink is being read by another Dataflow source, then
 * {@link PubsubIO.Read#timestampLabel(String)} can be used to ensure the other source reads
 * these timestamps from the appropriate attribute.
 */
public static Bound<String> timestampLabel(String timestampLabel) {
  return new Bound<>(DEFAULT_PUBSUB_CODER).timestampLabel(timestampLabel);
}
/**
 * Creates a transform that writes to Pub/Sub, adding each record's unique identifier to the
 * published messages in an attribute with the specified name. The value of the attribute is an
 * opaque string.
 *
 * <p>If the output from this sink is being read by another Dataflow source, then
 * {@link PubsubIO.Read#idLabel(String)} can be used to ensure that the other source reads
 * these unique identifiers from the appropriate attribute.
 */
public static Bound<String> idLabel(String idLabel) {
  return new Bound<>(DEFAULT_PUBSUB_CODER).idLabel(idLabel);
}
/**
 * Creates a transform that uses the given {@link Coder} to encode each of the
 * elements of the input collection into an output message.
 *
 * <p>By default, uses {@link StringUtf8Coder}, which writes input Java strings directly as
 * records.
 *
 * @param <T> the type of the elements of the input PCollection
 */
public static <T> Bound<T> withCoder(Coder<T> coder) {
  return new Bound<>(coder);
}
/**
* A {@link PTransform} that writes an unbounded {@link PCollection} of {@link String Strings}
* to a Cloud Pub/Sub stream.
*/
public static class Bound extends PTransform, PDone> {
/** The Cloud Pub/Sub topic to publish to. */
@Nullable private final PubsubTopic topic;
/** The name of the message attribute to publish message timestamps in. */
@Nullable private final String timestampLabel;
/** The name of the message attribute to publish unique message IDs in. */
@Nullable private final String idLabel;
private final Coder coder;
private Bound(Coder coder) {
this(null, null, null, null, coder);
}
private Bound(
String name, PubsubTopic topic, String timestampLabel, String idLabel, Coder coder) {
super(name);
this.topic = topic;
this.timestampLabel = timestampLabel;
this.idLabel = idLabel;
this.coder = coder;
}
/**
* Returns a new transform that's like this one but with the specified step
* name.
*
* Does not modify this object.
*/
public Bound named(String name) {
return new Bound<>(name, topic, timestampLabel, idLabel, coder);
}
/**
* Returns a new transform that's like this one but that writes to the specified
* topic.
*
* See {@link PubsubIO.PubsubTopic#fromPath(String)} for more details on the format of the
* {@code topic} string.
*
*
Does not modify this object.
*/
public Bound topic(String topic) {
return new Bound<>(name, PubsubTopic.fromPath(topic), timestampLabel, idLabel, coder);
}
/**
* Returns a new transform that's like this one but that publishes record timestamps
* to a message attribute with the specified name. See
* {@link PubsubIO.Write#timestampLabel(String)} for more details.
*
* Does not modify this object.
*/
public Bound timestampLabel(String timestampLabel) {
return new Bound<>(name, topic, timestampLabel, idLabel, coder);
}
/**
* Returns a new transform that's like this one but that publishes unique record IDs
* to a message attribute with the specified name. See {@link PubsubIO.Write#idLabel(String)}
* for more details.
*
* Does not modify this object.
*/
public Bound idLabel(String idLabel) {
return new Bound<>(name, topic, timestampLabel, idLabel, coder);
}
/**
* Returns a new transform that's like this one
* but that uses the given {@link Coder} to encode each of
* the elements of the input {@link PCollection} into an
* output record.
*
* Does not modify this object.
*
* @param the type of the elements of the input {@link PCollection}
*/
public Bound withCoder(Coder coder) {
return new Bound<>(name, topic, timestampLabel, idLabel, coder);
}
@Override
public PDone apply(PCollection input) {
if (topic == null) {
throw new IllegalStateException("need to set the topic of a PubsubIO.Write transform");
}
input.apply(ParDo.of(new PubsubWriter()));
return PDone.in(input.getPipeline());
}
@Override
protected Coder getDefaultOutputCoder() {
return VoidCoder.of();
}
public PubsubTopic getTopic() {
return topic;
}
public String getTimestampLabel() {
return timestampLabel;
}
public String getIdLabel() {
return idLabel;
}
public Coder getCoder() {
return coder;
}
private class PubsubWriter extends DoFn {
private static final int MAX_PUBLISH_BATCH_SIZE = 100;
private transient List output;
private transient Pubsub pubsubClient;
@Override
public void startBundle(Context c) {
this.output = new ArrayList<>();
this.pubsubClient =
Transport.newPubsubClient(c.getPipelineOptions().as(DataflowPipelineOptions.class))
.build();
}
@Override
public void processElement(ProcessContext c) throws IOException {
PubsubMessage message =
new PubsubMessage().encodeData(CoderUtils.encodeToByteArray(getCoder(), c.element()));
if (getTimestampLabel() != null) {
Map attributes = message.getAttributes();
if (attributes == null) {
attributes = new HashMap<>();
message.setAttributes(attributes);
}
attributes.put(getTimestampLabel(), String.valueOf(c.timestamp().getMillis()));
}
output.add(message);
if (output.size() >= MAX_PUBLISH_BATCH_SIZE) {
publish();
}
}
@Override
public void finishBundle(Context c) throws IOException {
if (!output.isEmpty()) {
publish();
}
}
private void publish() throws IOException {
PublishRequest publishRequest = new PublishRequest().setMessages(output);
pubsubClient.projects().topics()
.publish(getTopic().asPath(), publishRequest)
.execute();
output.clear();
}
}
}
/**
 * Disallow construction of utility class: {@code Write} only hosts static factory methods and
 * the nested {@code Bound} transform.
 */
private Write() {}
}
}