// org.dinky.shaded.paimon.migrate.FileMetaUtils (Maven / Gradle / Ivy)
// The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.dinky.shaded.paimon.migrate;
import org.dinky.shaded.paimon.data.BinaryRow;
import org.dinky.shaded.paimon.data.BinaryRowWriter;
import org.dinky.shaded.paimon.data.BinaryWriter;
import org.dinky.shaded.paimon.format.FieldStats;
import org.dinky.shaded.paimon.format.FileFormat;
import org.dinky.shaded.paimon.format.TableStatsExtractor;
import org.dinky.shaded.paimon.fs.FileIO;
import org.dinky.shaded.paimon.fs.FileStatus;
import org.dinky.shaded.paimon.fs.Path;
import org.dinky.shaded.paimon.io.CompactIncrement;
import org.dinky.shaded.paimon.io.DataFileMeta;
import org.dinky.shaded.paimon.io.NewFilesIncrement;
import org.dinky.shaded.paimon.statistics.FieldStatsCollector;
import org.dinky.shaded.paimon.stats.BinaryTableStats;
import org.dinky.shaded.paimon.stats.FieldStatsArraySerializer;
import org.dinky.shaded.paimon.table.AbstractFileStoreTable;
import org.dinky.shaded.paimon.table.Table;
import org.dinky.shaded.paimon.table.sink.CommitMessage;
import org.dinky.shaded.paimon.table.sink.CommitMessageImpl;
import org.dinky.shaded.paimon.types.DataField;
import org.dinky.shaded.paimon.types.RowType;
import org.dinky.shaded.paimon.utils.Pair;
import org.dinky.shaded.paimon.utils.StatsCollectorFactories;
import org.dinky.shaded.paimon.utils.TypeUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.function.Predicate;
import java.util.stream.Collectors;
/** To construct file meta data for external files. */
public class FileMetaUtils {
private static final Logger LOG = LoggerFactory.getLogger(FileMetaUtils.class);
public static List construct(
FileIO fileIO,
String format,
String location,
Table paimonTable,
Predicate filter,
Path dir,
Map rollback)
throws IOException {
List fileStatuses =
Arrays.stream(fileIO.listStatus(new Path(location)))
.filter(s -> !s.isDir())
.filter(filter)
.collect(Collectors.toList());
return fileStatuses.stream()
.map(
status ->
constructFileMeta(
format, status, fileIO, paimonTable, dir, rollback))
.collect(Collectors.toList());
}
public static CommitMessage commitFile(BinaryRow partition, List dataFileMetas) {
return new CommitMessageImpl(
partition,
0,
new NewFilesIncrement(dataFileMetas, Collections.emptyList()),
new CompactIncrement(
Collections.emptyList(), Collections.emptyList(), Collections.emptyList()));
}
// -----------------------------private method---------------------------------------------
private static DataFileMeta constructFileMeta(
String format,
FileStatus fileStatus,
FileIO fileIO,
Table table,
Path dir,
Map rollback) {
try {
FieldStatsCollector.Factory[] factories =
StatsCollectorFactories.createStatsFactories(
((AbstractFileStoreTable) table).coreOptions(),
table.rowType().getFieldNames());
TableStatsExtractor tableStatsExtractor =
FileFormat.getFileFormat(
((AbstractFileStoreTable) table)
.coreOptions()
.toConfiguration(),
format)
.createStatsExtractor(table.rowType(), factories)
.orElseThrow(
() ->
new RuntimeException(
"Can't get table stats extractor for format "
+ format));
Path newPath = renameFile(fileIO, fileStatus.getPath(), dir, format, rollback);
return constructFileMeta(
newPath.getName(),
fileStatus.getLen(),
newPath,
tableStatsExtractor,
fileIO,
table);
} catch (IOException e) {
throw new RuntimeException("error when construct file meta", e);
}
}
private static Path renameFile(
FileIO fileIO, Path originPath, Path newDir, String format, Map rollback)
throws IOException {
String subfix = "." + format;
String fileName = originPath.getName();
String newFileName = fileName.endsWith(subfix) ? fileName : fileName + "." + format;
Path newPath = new Path(newDir, newFileName);
rollback.put(newPath, originPath);
LOG.info("Migration: rename file from " + originPath + " to " + newPath);
fileIO.rename(originPath, newPath);
return newPath;
}
private static DataFileMeta constructFileMeta(
String fileName,
long fileSize,
Path path,
TableStatsExtractor tableStatsExtractor,
FileIO fileIO,
Table table)
throws IOException {
FieldStatsArraySerializer statsArraySerializer =
new FieldStatsArraySerializer(table.rowType());
Pair fileInfo =
tableStatsExtractor.extractWithFileInfo(fileIO, path);
BinaryTableStats stats = statsArraySerializer.toBinary(fileInfo.getLeft());
return DataFileMeta.forAppend(
fileName,
fileSize,
fileInfo.getRight().getRowCount(),
stats,
0,
0,
((AbstractFileStoreTable) table).schema().id());
}
public static BinaryRow writePartitionValue(
RowType partitionRowType,
Map partitionValues,
List valueSetters) {
BinaryRow binaryRow = new BinaryRow(partitionRowType.getFieldCount());
BinaryRowWriter binaryRowWriter = new BinaryRowWriter(binaryRow);
List fields = partitionRowType.getFields();
for (int i = 0; i < fields.size(); i++) {
Object value =
TypeUtils.castFromString(
partitionValues.get(fields.get(i).name()), fields.get(i).type());
valueSetters.get(i).setValue(binaryRowWriter, i, value);
}
binaryRowWriter.complete();
return binaryRow;
}
}