
com.netease.arctic.data.file.FileNameGenerator

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.netease.arctic.data.file;

import com.netease.arctic.ams.api.Constants;
import com.netease.arctic.data.DataFileType;
import com.netease.arctic.data.DataTreeNode;
import com.netease.arctic.data.DefaultKeyedFile;
import com.netease.arctic.io.writer.TaskWriterKey;
import com.netease.arctic.utils.IdGenerator;
import com.netease.arctic.utils.TableFileUtils;
import com.netease.arctic.shade.org.apache.iceberg.FileFormat;

import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * File name pattern: ${tree_node_id}-${file_type}-${transaction_id}-${partition_id}-${task_id}-${operation_id}-${count}
 * <ul>
 *   <li>tree_node_id: id of the {@link com.netease.arctic.data.DataTreeNode} the file belongs to</li>
 *   <li>file_type: short name of the file's {@link com.netease.arctic.data.DataFileType}</li>
 *   <li>transaction_id: id of the transaction in which the file was added</li>
 *   <li>partition_id: id of the partitioned data in a parallel engine like spark and flink</li>
 *   <li>task_id: id of the write task within the partition</li>
 *   <li>operation_id: a random id to avoid duplicated file names</li>
 *   <li>count: auto-incremented count within the writer</li>
 * </ul>
 * For example:
 * <ul>
 *   <li>1-B-100-0-0-4217271085623029157-0.parquet</li>
 *   <li>4-ED-101-0-0-9009257362994691056-1.parquet</li>
 *   <li>4-I-101-0-0-9009257362994691056-2.parquet</li>
 * </ul>
 */
public class FileNameGenerator {

  private static final String KEYED_FILE_NAME_PATTERN_STRING = "(\\d+)-(\\w+)-(\\d+)-(\\d+)-(\\d+)-.*";
  private static final Pattern KEYED_FILE_NAME_PATTERN = Pattern.compile(KEYED_FILE_NAME_PATTERN_STRING);
  private static final String FORMAT = "%d-%s-%d-%05d-%d-%s-%05d";

  public static final DefaultKeyedFile.FileMeta DEFAULT_BASE_FILE_META =
      new DefaultKeyedFile.FileMeta(0, DataFileType.BASE_FILE, DataTreeNode.ROOT);

  private final FileFormat fileFormat;
  private final int partitionId;
  private final long taskId;
  private final long transactionId;
  // random id to avoid duplicated file names
  private final String operationId = IdGenerator.randomId() + "";
  private final AtomicLong fileCount = new AtomicLong(0);

  public FileNameGenerator(
      FileFormat fileFormat,
      int partitionId,
      Long taskId,
      Long transactionId) {
    this.fileFormat = fileFormat;
    this.partitionId = partitionId;
    this.taskId = taskId;
    this.transactionId = transactionId == null ? 0 : transactionId;
  }

  public String fileName(TaskWriterKey key) {
    return fileFormat.addExtension(
        String.format(FORMAT,
            key.getTreeNode().getId(),
            key.getFileType().shortName(),
            transactionId,
            partitionId,
            taskId,
            operationId,
            fileCount.incrementAndGet()));
  }

  /**
   * Parse FileMeta for the ChangeStore.
   * Flink writes the transactionId as 0. If the path is not in arctic file format, or the
   * transactionId parsed from the path is 0, the iceberg sequenceNumber is used as the transactionId.
   *
   * @param path file path
   * @param sequenceNumber iceberg sequenceNumber
   * @return fileMeta
   */
  public static DefaultKeyedFile.FileMeta parseChange(String path, long sequenceNumber) {
    String fileName = TableFileUtils.getFileName(path);
    Matcher matcher = KEYED_FILE_NAME_PATTERN.matcher(fileName);
    if (matchArcticFileFormat(matcher)) {
      long nodeId = Long.parseLong(matcher.group(1));
      DataFileType type = DataFileType.ofShortName(matcher.group(2));
      long transactionId = Long.parseLong(matcher.group(3));
      transactionId = transactionId == 0 ? sequenceNumber : transactionId;
      DataTreeNode node = DataTreeNode.ofId(nodeId);
      return new DefaultKeyedFile.FileMeta(transactionId, type, node);
    } else {
      return new DefaultKeyedFile.FileMeta(sequenceNumber, DataFileType.INSERT_FILE, DataTreeNode.ROOT);
    }
  }

  /**
   * Parse FileMeta for the BaseStore.
   * Paths written by hive cannot be parsed as arctic file format; in that case
   * DEFAULT_BASE_FILE_META is returned.
   *
   * @param path file path
   * @return fileMeta
   */
  public static DefaultKeyedFile.FileMeta parseBase(String path) {
    String fileName = TableFileUtils.getFileName(path);
    Matcher matcher = KEYED_FILE_NAME_PATTERN.matcher(fileName);
    if (matchArcticFileFormat(matcher)) {
      long nodeId = Long.parseLong(matcher.group(1));
      DataFileType type = DataFileType.ofShortName(matcher.group(2));
      if (type == DataFileType.INSERT_FILE) {
        type = DataFileType.BASE_FILE;
      }
      long transactionId = Long.parseLong(matcher.group(3));
      DataTreeNode node = DataTreeNode.ofId(nodeId);
      return new DefaultKeyedFile.FileMeta(transactionId, type, node);
    } else {
      return DEFAULT_BASE_FILE_META;
    }
  }

  /**
   * Parse the file type for the ChangeStore.
   *
   * @param path file path
   * @return file type, or INSERT_FILE if the path is not in arctic file format
   */
  public static DataFileType parseFileTypeForChange(String path) {
    String fileName = TableFileUtils.getFileName(path);
    Matcher matcher = KEYED_FILE_NAME_PATTERN.matcher(fileName);
    if (matchArcticFileFormat(matcher)) {
      return DataFileType.ofShortName(matcher.group(2));
    } else {
      return DataFileType.INSERT_FILE;
    }
  }

  /**
   * Parse the file type for the BaseStore.
   *
   * @param path file path
   * @return file type, or BASE_FILE if the path is not in arctic file format
   */
  public static DataFileType parseFileTypeForBase(String path) {
    String fileName = TableFileUtils.getFileName(path);
    Matcher matcher = KEYED_FILE_NAME_PATTERN.matcher(fileName);
    if (matchArcticFileFormat(matcher)) {
      DataFileType type = DataFileType.ofShortName(matcher.group(2));
      if (type == DataFileType.INSERT_FILE) {
        type = DataFileType.BASE_FILE;
      }
      return type;
    } else {
      return DataFileType.BASE_FILE;
    }
  }

  /**
   * Parse the file type.
   *
   * @param path file path
   * @param tableType table type, base/change
   * @return file type
   */
  public static DataFileType parseFileType(String path, String tableType) {
    if (Constants.INNER_TABLE_CHANGE.equals(tableType)) {
      return parseFileTypeForChange(path);
    } else if (Constants.INNER_TABLE_BASE.equals(tableType)) {
      return parseFileTypeForBase(path);
    } else {
      throw new IllegalArgumentException("unknown tableType " + tableType);
    }
  }

  /**
   * Parse the keyed file node id from the file name.
   *
   * @param path file path
   * @return node, or node(0,0) if the path is not in arctic file format
   */
  public static DataTreeNode parseFileNodeFromFileName(String path) {
    String fileName = TableFileUtils.getFileName(path);
    Matcher matcher = KEYED_FILE_NAME_PATTERN.matcher(fileName);
    if (matchArcticFileFormat(matcher)) {
      long nodeId = Long.parseLong(matcher.group(1));
      return DataTreeNode.ofId(nodeId);
    } else {
      return DataTreeNode.ROOT;
    }
  }

  /**
   * Parse the transaction id from the file name.
   *
   * @param path file path
   * @return transactionId, or 0 if the path is not in arctic file format
   */
  public static long parseTransactionId(String path) {
    String fileName = TableFileUtils.getFileName(path);
    Matcher matcher = KEYED_FILE_NAME_PATTERN.matcher(fileName);
    if (matchArcticFileFormat(matcher)) {
      return Long.parseLong(matcher.group(3));
    } else {
      return 0L;
    }
  }

  /**
   * Parse the transaction id of a change file.
   *
   * @param path file path
   * @param sequenceNumber iceberg sequenceNumber used as a fallback
   * @return transactionId, or sequenceNumber if the parsed transactionId is 0
   */
  public static long parseChangeTransactionId(String path, long sequenceNumber) {
    long transactionId = parseTransactionId(path);
    return transactionId == 0 ? sequenceNumber : transactionId;
  }

  /**
   * Check whether the path is in arctic file format.
   *
   * @param path file path
   * @return true if the path is in arctic file format
   */
  public static boolean isArcticFileFormat(String path) {
    String fileName = TableFileUtils.getFileName(path);
    Matcher matcher = KEYED_FILE_NAME_PATTERN.matcher(fileName);
    return matchArcticFileFormat(matcher);
  }

  private static boolean matchArcticFileFormat(Matcher fileNameMatcher) {
    return fileNameMatcher.matches() && validFileType(fileNameMatcher.group(2));
  }

  private static boolean validFileType(String typeName) {
    try {
      DataFileType.ofShortName(typeName);
      return true;
    } catch (IllegalArgumentException e) {
      return false;
    }
  }
}
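
A minimal usage sketch of the parse helpers above, run against an illustrative path that follows the documented name pattern. The path and the values in the comments are examples, not taken from a real table, and the sketch assumes "I" is the short name of INSERT_FILE, consistent with the examples in the class Javadoc.

// Illustrative change-store path following
// ${tree_node_id}-${file_type}-${transaction_id}-${partition_id}-${task_id}-${operation_id}-${count}
String path = "/warehouse/db/sample/change/4-I-101-00000-0-9009257362994691056-00002.parquet";

boolean arctic = FileNameGenerator.isArcticFileFormat(path);               // true
long txId = FileNameGenerator.parseTransactionId(path);                    // 101
DataTreeNode node = FileNameGenerator.parseFileNodeFromFileName(path);     // DataTreeNode.ofId(4)
DataFileType changeType = FileNameGenerator.parseFileTypeForChange(path);  // INSERT_FILE ("I")
DataFileType baseType = FileNameGenerator.parseFileTypeForBase(path);      // INSERT_FILE promoted to BASE_FILE
DefaultKeyedFile.FileMeta meta = FileNameGenerator.parseChange(path, 7L);  // keeps transactionId 101, since it is non-zero

// Writers build names through an instance, using the constructor defined above.
FileNameGenerator generator = new FileNameGenerator(FileFormat.PARQUET, 0, 0L, 101L);
// generator.fileName(key) then yields names like "4-I-101-00000-0-<operationId>-00001.parquet"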



