/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.utilities.checkpointing;

import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.exception.HoodieException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Generate a checkpoint from a Kafka-Connect-HDFS managed data set.
 * Documentation: https://docs.confluent.io/current/connect/kafka-connect-hdfs/index.html
 */
public class KafkaConnectHdfsProvider extends InitialCheckPointProvider {
  private static final String FILENAME_SEPARATOR = "[\\+\\.]";

  public KafkaConnectHdfsProvider(TypedProperties props) {
    super(props);
  }

  @Override
  public void init(Configuration config) throws HoodieException {
    try {
      this.fs = FileSystem.get(config);
    } catch (IOException e) {
      throw new HoodieException("KafkaConnectHdfsProvider initialization failed");
    }
  }

  /**
   * PathFilter for Kafka-Connect-HDFS.
   * Directory format: /partition1=xxx/partition2=xxx
   * File format: topic+partition+lowerOffset+upperOffset.file
   */
  public static class KafkaConnectPathFilter implements PathFilter {
    // Matches Hive-style partition directories, e.g. "date=2020-01-01"
    private static final Pattern DIRECTORY_PATTERN = Pattern.compile(".*=.*");
    // Matches data files named topic+partition+lowerOffset+upperOffset[.extension]
    private static final Pattern PATTERN =
        Pattern.compile("[a-zA-Z0-9\\._\\-]+\\+\\d+\\+\\d+\\+\\d+(\\.\\w+)?");

    @Override
    public boolean accept(final Path path) {
      final String filename = path.getName();
      final Matcher matcher = PATTERN.matcher(filename);
      return matcher.matches();
    }

    public boolean acceptDir(final Path path) {
      final String dirName = path.getName();
      final Matcher matcher = DIRECTORY_PATTERN.matcher(dirName);
      return matcher.matches();
    }
  }
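
  // Illustrative matches for the filter above (hypothetical names):
  //   accept("my-topic+0+100+200.parquet")  -> true   (topic my-topic, partition 0, offsets 100-200)
  //   accept("commit-log.txt")              -> false  (no "+partition+lowerOffset+upperOffset" segments)
  //   acceptDir("date=2020-01-01")          -> true   (Hive-style partition directory)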

  /**
   * Convert a map holding the max offset of each partition into a checkpoint string.
   *
   * @param topic      Topic name
   * @param checkpoint Map with partition as key and max offset as value
   * @return Checkpoint string
   */
  private static String buildCheckpointStr(final String topic,
                                           final HashMap<Integer, Integer> checkpoint) {
    final StringBuilder checkpointStr = new StringBuilder();
    checkpointStr.append(topic);
    // Partition IDs are contiguous 0..size-1; getCheckpoint() validates this before calling.
    for (int i = 0; i < checkpoint.size(); ++i) {
      checkpointStr.append(",").append(i).append(":").append(checkpoint.get(i));
    }
    return checkpointStr.toString();
  }
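
  // Example (hypothetical values): buildCheckpointStr("impressions", {0: 300, 1: 250})
  // returns "impressions,0:300,1:250".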

  /**
   * List file statuses recursively.
   *
   * @param curPath Current path
   * @param filter  PathFilter
   * @return All file statuses matching the Kafka Connect naming convention
   * @throws IOException if listing the path fails
   */
  private ArrayList<FileStatus> listAllFileStatus(Path curPath,
                                                  KafkaConnectPathFilter filter) throws IOException {
    ArrayList<FileStatus> allFileStatus = new ArrayList<>();
    FileStatus[] fileStatus = this.fs.listStatus(curPath);
    for (FileStatus status : fileStatus) {
      if (status.isDirectory() && filter.acceptDir(status.getPath())) {
        allFileStatus.addAll(listAllFileStatus(status.getPath(), filter));
      } else {
        if (filter.accept(status.getPath())) {
          allFileStatus.add(status);
        }
      }
    }
    return allFileStatus;
  }
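
  // For a hypothetical layout such as
  //   /topics/my-topic/date=2020-01-01/my-topic+0+100+200.parquet
  //   /topics/my-topic/date=2020-01-02/my-topic+1+0+150.parquet
  // the recursion above descends into each "key=value" directory and collects both files.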

  @Override
  public String getCheckpoint() throws HoodieException {
    final KafkaConnectPathFilter filter = new KafkaConnectPathFilter();
    ArrayList<FileStatus> fileStatus;
    try {
      fileStatus = listAllFileStatus(this.path, filter);
    } catch (IOException e) {
      throw new HoodieException(e.toString(), e);
    }
    if (fileStatus.isEmpty()) {
      throw new HoodieException("No valid Kafka Connect HDFS file found under: " + this.path.getName());
    }
    final String topic = fileStatus.get(0).getPath().getName().split(FILENAME_SEPARATOR)[0];
    int maxPartition = -1;
    final HashMap<Integer, Integer> checkpointMap = new HashMap<>();
    for (final FileStatus status : fileStatus) {
      final String filename = status.getPath().getName();
      final String[] groups = filename.split(FILENAME_SEPARATOR);
      // groups: [topic, partition, lowerOffset, upperOffset, (extension)]
      final int partition = Integer.parseInt(groups[1]);
      final int offsetUpper = Integer.parseInt(groups[3]);
      maxPartition = Math.max(maxPartition, partition);
      if (checkpointMap.containsKey(partition)) {
        checkpointMap.put(partition, Math.max(checkpointMap.get(partition), offsetUpper));
      } else {
        checkpointMap.put(partition, offsetUpper);
      }
    }
    if (checkpointMap.size() != maxPartition + 1) {
      throw new HoodieException("Missing partition from the file scan, "
          + "max partition found(start from 0): "
          + maxPartition
          + " total partitions number appear in "
          + this.path.getName()
          + " is: "
          + checkpointMap.size()
          + " total partitions number expected: "
          + (maxPartition + 1));
    }
    return buildCheckpointStr(topic, checkpointMap);
  }
}
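
// Minimal usage sketch (illustrative; assumes the base path is read through the
// InitialCheckPointProvider path property — the property name and HDFS path below
// are assumptions for illustration):
//
//   TypedProperties props = new TypedProperties();
//   props.setProperty("hoodie.deltastreamer.checkpoint.provider.path", "hdfs:///topics/my-topic");
//   KafkaConnectHdfsProvider provider = new KafkaConnectHdfsProvider(props);
//   provider.init(new Configuration());
//   String checkpoint = provider.getCheckpoint(); // e.g. "my-topic,0:300,1:250"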