io.streamthoughts.kafka.connect.filepulse.source.FilePulseSourceTask Maven / Gradle / Ivy
/*
* Copyright 2019-2020 StreamThoughts.
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.streamthoughts.kafka.connect.filepulse.source;
import io.streamthoughts.kafka.connect.filepulse.config.SourceTaskConfig;
import io.streamthoughts.kafka.connect.filepulse.data.TypedStruct;
import io.streamthoughts.kafka.connect.filepulse.errors.ConnectFilePulseException;
import io.streamthoughts.kafka.connect.filepulse.filter.DefaultRecordFilterPipeline;
import io.streamthoughts.kafka.connect.filepulse.filter.RecordFilterPipeline;
import io.streamthoughts.kafka.connect.filepulse.fs.TaskFileURIProvider;
import io.streamthoughts.kafka.connect.filepulse.reader.RecordsIterable;
import io.streamthoughts.kafka.connect.filepulse.state.StateBackingStoreAccess;
import org.apache.kafka.connect.data.Schema;
import org.apache.kafka.connect.source.SourceRecord;
import org.apache.kafka.connect.source.SourceTask;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.locks.ReentrantLock;
import java.util.stream.Collectors;
/**
* The FilePulseSourceTask.
*/
public class FilePulseSourceTask extends SourceTask {
private static final Logger LOG = LoggerFactory.getLogger(FilePulseSourceTask.class);
private static final int CONSECUTIVE_WAITS_BEFORE_RETURN = 3;
private static final String CONNECT_NAME_CONFIG = "name";
public SourceTaskConfig taskConfig;
private String defaultTopic;
private DefaultFileRecordsPollingConsumer consumer;
private SourceOffsetPolicy offsetPolicy;
private FileObjectStateReporter reporter;
private volatile FileObjectContext contextToBeCommitted;
private StateBackingStoreAccess sharedStore;
private TaskFileURIProvider fileURIProvider;
private String connectorGroupName;
/**
* Used to check if the task's resources was closed.
*/
private final AtomicBoolean isResourceClosed = new AtomicBoolean(false);
/**
* Used to check if the task is idle.
*/
private final AtomicBoolean isIdle = new AtomicBoolean(false);
private final ConcurrentLinkedQueue completedToCommit = new ConcurrentLinkedQueue<>();
private final Map valueSchemas = new HashMap<>();
private final AtomicLong taskThreadId = new AtomicLong(0);
private final AtomicReference state = new AtomicReference<>(State.STOPPED);
/**
* Used to ensure that start(), stop() and commitRecord() calls are serialized.
*/
private final ReentrantLock stateLock = new ReentrantLock();
protected enum State {RUNNING, STOPPED}
/**
* {@inheritDoc}
*/
@Override
public String version() {
return new FilePulseSourceConnector().version();
}
/**
* {@inheritDoc}
*/
@Override
public void start(final Map props) {
stateLock.lock();
try {
if (!state.compareAndSet(State.STOPPED, State.RUNNING)) {
LOG.info("Connector has already been started");
return;
}
LOG.info("Starting FilePulse source task");
final Map configProperties = new HashMap<>(props);
taskConfig = new SourceTaskConfig(configProperties);
connectorGroupName = props.get(CONNECT_NAME_CONFIG);
offsetPolicy = taskConfig.getSourceOffsetPolicy();
defaultTopic = taskConfig.topic();
valueSchemas.put(defaultTopic, taskConfig.getValueConnectSchema());
sharedStore = new StateBackingStoreAccess(
connectorGroupName,
taskConfig::getStateBackingStore,
true
);
reporter = new FileObjectStateReporter(sharedStore.get().getResource()) {
@Override
public void onCompleted(final FileObjectContext context) {
super.onCompleted(context);
completedToCommit.add(context);
}
};
consumer = newFileRecordsPollingConsumer();
consumer.setStateListener(reporter);
fileURIProvider = taskConfig.getFileURIProvider();
taskThreadId.set(Thread.currentThread().getId());
LOG.info("Started FilePulse source task");
} catch (final Throwable t) {
// This task has failed, so close any resources before throwing
closeResources();
throw t;
} finally {
stateLock.unlock();
}
}
private DefaultFileRecordsPollingConsumer newFileRecordsPollingConsumer() {
final RecordFilterPipeline> filter = new DefaultRecordFilterPipeline(
taskConfig.filters()
);
return new DefaultFileRecordsPollingConsumer(
context,
taskConfig.reader(),
filter,
offsetPolicy,
taskConfig.isReadCommittedFile());
}
/**
* {@inheritDoc}
*/
@Override
public List poll() throws InterruptedException {
if (isIdle.get()) {
LOG.trace("Cannot poll new data. FilePulse source task is in IDLE state");
busyWait();
return null;
}
LOG.trace("Polling for new data");
try {
final MaxConsecutiveAttempts consecutiveWaits = new MaxConsecutiveAttempts(CONSECUTIVE_WAITS_BEFORE_RETURN);
contextToBeCommitted = consumer.context();
while (isTaskRunning()) {
List results = null;
if (!consumer.hasNext()) {
contextToBeCommitted = null;
if (fileURIProvider.hasMore()) {
consumer.addAll(fileURIProvider.nextURIs());
// fileURIProvider may have more URIs but still return empty collection
// if no file is immediately available. In this case, this method should
// be blocked before returning.
if (!consumer.hasNext() && consecutiveWaits.checkAndDecrement()) {
// Check if the SourceTask is still running to
// return immediately instead of waiting
if (isTaskRunning()) busyWait();
continue;
}
// No more data can be read from this Source Task
} else {
LOG.info(
"Completed all object files. FilePulse source task is transiting to " +
"IDLE state while waiting for new reconfiguration request from source connector."
);
isIdle.set(true);
return null;
}
} else {
try {
final RecordsIterable> records = consumer.next();
if (!records.isEmpty()) {
final FileObjectContext context = consumer.context();
LOG.debug("Returning {} records for {}", records.size(), context.metadata());
results = records.stream()
.map(r -> buildSourceRecord(context, r))
.collect(Collectors.toList());
// Check if the SourceTask is still running to
// return immediately instead of waiting
} else if (isTaskRunning() &&
consumer.hasNext() &&
consecutiveWaits.checkAndDecrement()) {
busyWait();
continue;
}
} catch (ConnectFilePulseException e) {
if (taskConfig.isTaskHaltOnError()) {
throw e;
} else {
LOG.error("Caught unexpected error while processing file. Ignore and continue", e);
}
}
}
// Check if the SourceTask should stop to close resources.
if (!isTaskRunning()) continue;
return results;
}
} catch (final Throwable t) {
// This task has failed, so close any resources (maybe reopened if needed) before throwing
LOG.error("This task has failed due to uncaught error and will be stopped.");
closeResources();
throw t;
}
// Only in case of shutdown
closeResources();
LOG.info("Stopped FilePulse source task.");
return null;
}
private boolean isTaskRunning() {
return state.get() == State.RUNNING;
}
private void busyWait() throws InterruptedException {
LOG.trace("Waiting {} ms to execute next poll iteration", taskConfig.getTaskEmptyPollWaitMs());
Thread.sleep(taskConfig.getTaskEmptyPollWaitMs());
}
/**
* {@inheritDoc}
*/
@Override
public void commit() {
boolean locked = stateLock.tryLock();
if (locked) {
try {
if (isTaskRunning() && contextToBeCommitted != null) {
reporter.notify(contextToBeCommitted, FileObjectStatus.READING);
}
if (!isResourceClosed.get()) {
while (!completedToCommit.isEmpty()) {
final FileObjectContext file = completedToCommit.poll();
LOG.info("Committed offset for file: {}", file.metadata());
safelyCommit(file);
}
// If in IDLE state then we can close resources
// without waiting for the task to be stopped.
if (isIdle.get()) {
closeResources();
}
}
} finally {
stateLock.unlock();
}
} else {
LOG.warn("Couldn't commit due to a concurrent connector shutdown or restart");
}
}
private void safelyCommit(final FileObjectContext committed) {
try {
reporter.notify(committed, FileObjectStatus.COMMITTED);
} catch (Exception e) {
LOG.warn("Failed to notify committed file: {}", context, e);
}
}
private SourceRecord buildSourceRecord(final FileObjectContext context,
final FileRecord> record) {
final FileObjectMeta metadata = context.metadata();
final Map sourcePartition = offsetPolicy.toPartitionMap(metadata);
final Map sourceOffsets = offsetPolicy.toOffsetMap(record.offset().toSourceOffset());
try {
final SourceRecord result = record.toSourceRecord(
sourcePartition,
sourceOffsets,
context.metadata(),
defaultTopic,
null,
valueSchemas::get,
new FileRecord.ConnectSchemaMapperOptions(
taskConfig.isValueConnectSchemaMergeEnabled(),
taskConfig.isSchemaKeepLeadingUnderscoreOnFieldName()
)
);
if (taskConfig.isValueConnectSchemaMergeEnabled()) {
valueSchemas.put(result.topic(), result.valueSchema());
}
return result;
} catch (final Throwable t) {
var exception = new ConnectFilePulseException(String.format(
"Failed to convert data into Kafka Connect record at offset %s from object-file: %s'",
context.offset(),
context.metadata()),
t
);
// Close internal iterator for the current object-file so that it will be marked as failed
consumer.closeCurrentIterator(exception);
throw exception;
}
}
/**
* {@inheritDoc}
*/
@Override
public void stop() {
stateLock.lock();
try {
if (!state.compareAndSet(State.RUNNING, State.STOPPED)) {
LOG.info("Task has already been stopped");
return;
}
LOG.info("Stopping FilePulse source task");
doStop();
} finally {
stateLock.unlock();
}
}
private void doStop() {
// In earlier versions of Kafka Connect, 'SourceTask::stop()' was not called from the task thread.
// In this case, resources should be closed at the end of 'SourceTask::poll()'
// when no longer running or if there is an error.
// Since https://issues.apache.org/jira/browse/KAFKA-10792 the SourceTask::stop()
// is called from the source task's dedicated thread
if (taskThreadId.longValue() == Thread.currentThread().getId()) {
closeResources();
LOG.info("Stopped FilePulse source task.");
}
}
private void closeResources() {
if (!isResourceClosed.compareAndSet(false, true)) {
LOG.info("Task's resources have already been closed");
return;
}
LOG.info("Closing resources FilePulse source task");
try {
if (consumer != null) {
try {
consumer.close();
} catch (final Throwable t) {
LOG.warn("Failed to close FileRecordsPollingConsumer. Error: {}", t.getMessage());
}
}
if (fileURIProvider != null) {
try {
fileURIProvider.close();
} catch (final Exception e) {
LOG.warn("Failed to close FileURIProvider. Error: {}", e.getMessage());
}
}
} finally {
contextToBeCommitted = null;
consumer = null;
reporter = null;
closeSharedStateBackingStore();
LOG.info("Closed resources FilePulse source task");
}
}
private void closeSharedStateBackingStore() {
try {
if (sharedStore != null) {
sharedStore.close();
}
} catch (Exception exception) {
LOG.error("Failed to shared StateBackingStore '{}'", connectorGroupName);
}
}
static final class MaxConsecutiveAttempts {
final AtomicInteger consecutiveAttempts;
MaxConsecutiveAttempts(final int maxConsecutiveAttempts) {
if (maxConsecutiveAttempts <= 0) {
throw new IllegalArgumentException("'maxConsecutiveAttempts' must be superior to 0");
}
this.consecutiveAttempts = new AtomicInteger(maxConsecutiveAttempts);
}
public boolean checkAndDecrement() {
if (getRemaining() < 0) {
throw new IllegalStateException("cannot make a new consecutive attempt (remaining=0)");
}
return this.consecutiveAttempts.getAndDecrement() > 0;
}
int getRemaining() {
return consecutiveAttempts.get();
}
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy