
// org.apache.flink.table.utils.PartitionPathUtils Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.table.utils;
import org.apache.flink.annotation.Internal;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.core.fs.FileStatus;
import org.apache.flink.core.fs.FileSystem;
import org.apache.flink.core.fs.Path;
import org.apache.flink.table.api.TableException;
import org.apache.flink.table.data.GenericRowData;
import org.apache.flink.table.data.StringData;
import org.apache.flink.table.data.TimestampData;
import org.apache.flink.table.types.DataType;
import org.apache.flink.table.types.logical.LogicalTypeRoot;
import java.io.IOException;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/** Utils for file system. */
@Internal
public class PartitionPathUtils {
private static final Pattern PARTITION_NAME_PATTERN = Pattern.compile("([^/]+)=([^/]+)");
private static final BitSet CHAR_TO_ESCAPE = new BitSet(128);
static {
for (char c = 0; c < ' '; c++) {
CHAR_TO_ESCAPE.set(c);
}
/*
* ASCII 01-1F are HTTP control characters that need to be escaped.
* \u000A and \u000D are \n and \r, respectively.
*/
char[] clist =
new char[] {
'\u0001', '\u0002', '\u0003', '\u0004', '\u0005', '\u0006', '\u0007', '\u0008',
'\u0009', '\n', '\u000B', '\u000C', '\r', '\u000E', '\u000F', '\u0010',
'\u0011', '\u0012', '\u0013', '\u0014', '\u0015', '\u0016', '\u0017', '\u0018',
'\u0019', '\u001A', '\u001B', '\u001C', '\u001D', '\u001E', '\u001F', '"', '#',
'%', '\'', '*', '/', ':', '=', '?', '\\', '\u007F', '{', '[', ']', '^'
};
for (char c : clist) {
CHAR_TO_ESCAPE.set(c);
}
}
private static boolean needsEscaping(char c) {
return c < CHAR_TO_ESCAPE.size() && CHAR_TO_ESCAPE.get(c);
}
/**
* Make partition path from partition spec.
*
* @param partitionSpec The partition spec.
* @return An escaped, valid partition name.
*/
public static String generatePartitionPath(LinkedHashMap partitionSpec) {
if (partitionSpec.isEmpty()) {
return "";
}
StringBuilder suffixBuf = new StringBuilder();
int i = 0;
for (Map.Entry e : partitionSpec.entrySet()) {
if (i > 0) {
suffixBuf.append(Path.SEPARATOR);
}
suffixBuf.append(escapePathName(e.getKey()));
suffixBuf.append('=');
suffixBuf.append(escapePathName(e.getValue()));
i++;
}
suffixBuf.append(Path.SEPARATOR);
return suffixBuf.toString();
}
/**
* Escapes a path name.
*
* @param path The path to escape.
* @return An escaped path name.
*/
private static String escapePathName(String path) {
if (path == null || path.length() == 0) {
throw new TableException("Path should not be null or empty: " + path);
}
StringBuilder sb = new StringBuilder();
for (int i = 0; i < path.length(); i++) {
char c = path.charAt(i);
if (needsEscaping(c)) {
sb.append('%');
sb.append(String.format("%1$02X", (int) c));
} else {
sb.append(c);
}
}
return sb.toString();
}
/**
* Make partition values from path.
*
* @param currPath partition file path.
* @return Sequential partition specs.
*/
public static List extractPartitionValues(Path currPath) {
return new ArrayList<>(extractPartitionSpecFromPath(currPath).values());
}
/**
* Make partition spec from path.
*
* @param currPath partition file path.
* @return Sequential partition specs.
*/
public static LinkedHashMap extractPartitionSpecFromPath(Path currPath) {
LinkedHashMap fullPartSpec = new LinkedHashMap<>();
List kvs = new ArrayList<>();
do {
String component = currPath.getName();
Matcher m = PARTITION_NAME_PATTERN.matcher(component);
if (m.matches()) {
String k = unescapePathName(m.group(1));
String v = unescapePathName(m.group(2));
String[] kv = new String[2];
kv[0] = k;
kv[1] = v;
kvs.add(kv);
}
currPath = currPath.getParent();
} while (currPath != null && !currPath.getName().isEmpty());
// reverse the list since we checked the part from leaf dir to table's base dir
for (int i = kvs.size(); i > 0; i--) {
fullPartSpec.put(kvs.get(i - 1)[0], kvs.get(i - 1)[1]);
}
return fullPartSpec;
}
public static String unescapePathName(String path) {
StringBuilder sb = new StringBuilder();
for (int i = 0; i < path.length(); i++) {
char c = path.charAt(i);
if (c == '%' && i + 2 < path.length()) {
int code = -1;
try {
code = Integer.parseInt(path.substring(i + 1, i + 3), 16);
} catch (Exception ignored) {
}
if (code >= 0) {
sb.append((char) code);
i += 2;
continue;
}
}
sb.append(c);
}
return sb.toString();
}
/** List file status without hidden files. */
public static FileStatus[] listStatusWithoutHidden(FileSystem fs, Path dir) throws IOException {
FileStatus[] statuses = fs.listStatus(dir);
if (statuses == null) {
return null;
}
return Arrays.stream(statuses)
.filter(fileStatus -> !isHiddenFile(fileStatus))
.toArray(FileStatus[]::new);
}
/**
* Search all partitions in this path.
*
* @param path search path.
* @param partitionNumber partition number, it will affect path structure.
* @return all partition specs to its path.
*/
public static List, Path>> searchPartSpecAndPaths(
FileSystem fs, Path path, int partitionNumber) {
FileStatus[] generatedParts = getFileStatusRecurse(path, partitionNumber, fs);
List, Path>> ret = new ArrayList<>();
for (FileStatus part : generatedParts) {
// ignore hidden file
if (isHiddenFile(part)) {
continue;
}
ret.add(new Tuple2<>(extractPartitionSpecFromPath(part.getPath()), part.getPath()));
}
return ret;
}
/**
* Extract partition value from path and fill to record.
*
* @param fieldNames record field names.
* @param fieldTypes record field types.
* @param selectFields the selected fields.
* @param partitionKeys the partition field names.
* @param path the file path that the partition located.
* @param defaultPartValue default value of partition field.
* @return the filled record.
*/
public static GenericRowData fillPartitionValueForRecord(
String[] fieldNames,
DataType[] fieldTypes,
int[] selectFields,
List partitionKeys,
Path path,
String defaultPartValue) {
GenericRowData record = new GenericRowData(selectFields.length);
LinkedHashMap partSpec =
PartitionPathUtils.extractPartitionSpecFromPath(path);
for (int i = 0; i < selectFields.length; i++) {
int selectField = selectFields[i];
String name = fieldNames[selectField];
if (partitionKeys.contains(name)) {
String value = partSpec.get(name);
value = defaultPartValue.equals(value) ? null : value;
record.setField(
i,
PartitionPathUtils.convertStringToInternalValue(
value, fieldTypes[selectField]));
}
}
return record;
}
/**
* Restore partition value from string and type.
*
* @param valStr string partition value.
* @param type type of partition field.
* @return partition value.
*/
public static Object convertStringToInternalValue(String valStr, DataType type) {
if (valStr == null) {
return null;
}
LogicalTypeRoot typeRoot = type.getLogicalType().getTypeRoot();
switch (typeRoot) {
case CHAR:
case VARCHAR:
return StringData.fromString(valStr);
case BOOLEAN:
return Boolean.parseBoolean(valStr);
case TINYINT:
return Byte.parseByte(valStr);
case SMALLINT:
return Short.parseShort(valStr);
case INTEGER:
return Integer.parseInt(valStr);
case BIGINT:
return Long.parseLong(valStr);
case FLOAT:
return Float.parseFloat(valStr);
case DOUBLE:
return Double.parseDouble(valStr);
case DATE:
return (int) LocalDate.parse(valStr).toEpochDay();
case TIMESTAMP_WITHOUT_TIME_ZONE:
return TimestampData.fromLocalDateTime(LocalDateTime.parse(valStr));
default:
throw new RuntimeException(
String.format(
"Can not convert %s to type %s for partition value", valStr, type));
}
}
private static FileStatus[] getFileStatusRecurse(Path path, int expectLevel, FileSystem fs) {
ArrayList result = new ArrayList<>();
try {
FileStatus fileStatus = fs.getFileStatus(path);
listStatusRecursively(fs, fileStatus, 0, expectLevel, result);
} catch (IOException ignore) {
return new FileStatus[0];
}
return result.toArray(new FileStatus[0]);
}
private static void listStatusRecursively(
FileSystem fs,
FileStatus fileStatus,
int level,
int expectLevel,
List results)
throws IOException {
if (expectLevel == level) {
results.add(fileStatus);
return;
}
if (fileStatus.isDir()) {
for (FileStatus stat : fs.listStatus(fileStatus.getPath())) {
listStatusRecursively(fs, stat, level + 1, expectLevel, results);
}
}
}
private static boolean isHiddenFile(FileStatus fileStatus) {
String name = fileStatus.getPath().getName();
return name.startsWith("_") || name.startsWith(".");
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy