// Many resources are needed to download a project. Please understand that we have to cover our server costs. Thank you in advance. The project price is only $1.
// You can buy this project and download or modify it as often as you want.
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.utilities.sources;
import org.apache.hudi.HoodieConversionUtils;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.table.checkpoint.Checkpoint;
import org.apache.hudi.common.table.checkpoint.StreamerCheckpointV2;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.util.Lazy;
import org.apache.hudi.utilities.config.PulsarSourceConfig;
import org.apache.hudi.utilities.exception.HoodieReadFromSourceException;
import org.apache.hudi.utilities.schema.SchemaProvider;
import org.apache.pulsar.client.api.Consumer;
import org.apache.pulsar.client.api.MessageId;
import org.apache.pulsar.client.api.PulsarClient;
import org.apache.pulsar.client.api.PulsarClientException;
import org.apache.pulsar.client.api.SubscriptionInitialPosition;
import org.apache.pulsar.client.api.SubscriptionType;
import org.apache.pulsar.client.impl.PulsarClientImpl;
import org.apache.pulsar.common.naming.TopicName;
import org.apache.pulsar.shade.io.netty.channel.EventLoopGroup;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.pulsar.JsonUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.Closeable;
import java.io.IOException;
import java.time.Duration;
import java.util.Arrays;
import java.util.Collections;
import java.util.concurrent.TimeUnit;
import static org.apache.hudi.common.util.ConfigUtils.checkRequiredConfigProperties;
import static org.apache.hudi.common.util.ConfigUtils.getLongWithAltKeys;
import static org.apache.hudi.common.util.ConfigUtils.getStringWithAltKeys;
import static org.apache.hudi.common.util.ThreadUtils.collectActiveThreads;
import static org.apache.hudi.utilities.config.PulsarSourceConfig.PULSAR_SOURCE_ADMIN_ENDPOINT_URL;
import static org.apache.hudi.utilities.config.PulsarSourceConfig.PULSAR_SOURCE_MAX_RECORDS_PER_BATCH_THRESHOLD;
import static org.apache.hudi.utilities.config.PulsarSourceConfig.PULSAR_SOURCE_OFFSET_AUTO_RESET_STRATEGY;
import static org.apache.hudi.utilities.config.PulsarSourceConfig.PULSAR_SOURCE_SERVICE_ENDPOINT_URL;
import static org.apache.hudi.utilities.config.PulsarSourceConfig.PULSAR_SOURCE_TOPIC_NAME;
/**
* Source fetching data from Pulsar topics
*/
public class PulsarSource extends RowSource implements Closeable {
private static final Logger LOG = LoggerFactory.getLogger(PulsarSource.class);
private static final Duration GRACEFUL_SHUTDOWN_TIMEOUT = Duration.ofSeconds(20);
private static final String HUDI_PULSAR_CONSUMER_ID_FORMAT = "hudi-pulsar-consumer-%d";
private static final String[] PULSAR_META_FIELDS = new String[]{
"__key",
"__topic",
"__messageId",
"__publishTime",
"__eventTime",
"__messageProperties"
};
private final String topicName;
private final String serviceEndpointURL;
private final String adminEndpointURL;
// NOTE: We're keeping the client so that we can shut it down properly
private final Lazy pulsarClient;
private final Lazy> pulsarConsumer;
public PulsarSource(TypedProperties props,
JavaSparkContext sparkContext,
SparkSession sparkSession,
SchemaProvider schemaProvider) {
super(props, sparkContext, sparkSession, schemaProvider);
checkRequiredConfigProperties(props,
Arrays.asList(PULSAR_SOURCE_TOPIC_NAME, PULSAR_SOURCE_SERVICE_ENDPOINT_URL));
// Converting to a descriptor allows us to canonicalize the topic's name properly
this.topicName = TopicName.get(getStringWithAltKeys(props, PULSAR_SOURCE_TOPIC_NAME)).toString();
// TODO validate endpoints provided in the appropriate format
this.serviceEndpointURL = getStringWithAltKeys(props, PULSAR_SOURCE_SERVICE_ENDPOINT_URL);
this.adminEndpointURL = getStringWithAltKeys(props, PULSAR_SOURCE_ADMIN_ENDPOINT_URL);
this.pulsarClient = Lazy.lazily(this::initPulsarClient);
this.pulsarConsumer = Lazy.lazily(this::subscribeToTopic);
}
@Override
protected Pair