// SPDX-FileCopyrightText: 2023 LakeSoul Contributors
//
// SPDX-License-Identifier: Apache-2.0
package org.apache.flink.lakesoul.source;
import com.dmetasoul.lakesoul.meta.DBUtil;
import com.dmetasoul.lakesoul.meta.DataFileInfo;
import com.dmetasoul.lakesoul.meta.DataOperation;
import com.dmetasoul.lakesoul.meta.LakeSoulOptions;
import com.dmetasoul.lakesoul.meta.entity.TableInfo;
import org.apache.flink.api.connector.source.*;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.core.fs.Path;
import org.apache.flink.core.io.SimpleVersionedSerializer;
import org.apache.flink.lakesoul.tool.FlinkUtil;
import org.apache.flink.lakesoul.tool.LakeSoulSinkOptions;
import org.apache.flink.lakesoul.types.TableId;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.types.logical.RowType;
import org.apache.parquet.filter2.predicate.FilterPredicate;
import javax.annotation.Nullable;
import java.time.LocalDateTime;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;
import java.util.*;
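/**
 * A FLIP-27 {@link Source} for reading LakeSoul tables, usable both as a bounded batch
 * source and as a continuous streaming source, with optional partition pruning
 * ({@code remainingPartitions}) and parquet filter push-down ({@code filter}).
 *
 * <p>Illustrative construction only; the schema and primary-key column below are assumed
 * for the example (this source is normally instantiated by the table planner):
 * <pre>{@code
 * LakeSoulSource source = new LakeSoulSource(
 *         tableId,                               // e.g. a hypothetical "default.user_events"
 *         rowType,                               // projected physical row type
 *         rowTypeWithPk,                         // row type including primary-key columns
 *         true,                                  // streaming read
 *         Collections.singletonList("user_id"),  // hypothetical primary-key column
 *         Collections.emptyMap(),                // no extra read options
 *         null,                                  // no partition pruning
 *         null);                                 // no filter push-down
 * }</pre>
 */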
public class LakeSoulSource implements Source<RowData, LakeSoulSplit, LakeSoulPendingSplits> {
    TableId tableId;
    RowType rowType;
    RowType rowTypeWithPk;
    boolean isStreaming;
    List<String> pkColumns;
    Map<String, String> optionParams;
    @Nullable
    List<Map<String, String>> remainingPartitions;
    @Nullable
    FilterPredicate filter;

    public LakeSoulSource(TableId tableId,
                          RowType rowType,
                          RowType rowTypeWithPk,
                          boolean isStreaming,
                          List<String> pkColumns,
                          Map<String, String> optionParams,
                          @Nullable List<Map<String, String>> remainingPartitions,
                          @Nullable FilterPredicate filter) {
        this.tableId = tableId;
        this.rowType = rowType;
        this.rowTypeWithPk = rowTypeWithPk;
        this.isStreaming = isStreaming;
        this.pkColumns = pkColumns;
        this.optionParams = optionParams;
        this.remainingPartitions = remainingPartitions;
        this.filter = filter;
    }
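    /** A streaming read is an unbounded changelog; a batch read is a bounded snapshot. */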
    @Override
    public Boundedness getBoundedness() {
        if (this.isStreaming) {
            return Boundedness.CONTINUOUS_UNBOUNDED;
        } else {
            return Boundedness.BOUNDED;
        }
    }
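    /**
     * Creates the per-subtask reader. Splits assigned by the enumerator are read through a
     * {@link LakeSoulSplitReader} supplier so each fetcher thread can obtain its own split
     * reader, and records are emitted as {@link RowData} via {@link LakeSoulRecordEmitter}.
     */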
    @Override
    public SourceReader<RowData, LakeSoulSplit> createReader(SourceReaderContext readerContext) throws Exception {
        Configuration conf = Configuration.fromMap(optionParams);
        conf.addAll(readerContext.getConfiguration());
        return new LakeSoulSourceReader(
                () -> new LakeSoulSplitReader(conf,
                        this.rowType,
                        this.rowTypeWithPk,
                        this.pkColumns,
                        this.isStreaming,
                        this.optionParams.getOrDefault(LakeSoulSinkOptions.CDC_CHANGE_COLUMN, ""),
                        this.filter),
                new LakeSoulRecordEmitter(),
                readerContext.getConfiguration(),
                readerContext);
    }
    @Override
    public SplitEnumerator<LakeSoulSplit, LakeSoulPendingSplits> createEnumerator(
            SplitEnumeratorContext<LakeSoulSplit> enumContext) {
        TableInfo tif = DataOperation.dbManager().getTableInfoByNameAndNamespace(tableId.table(),
                tableId.schema());
        List<String> readStartTimestampWithTimeZone =
                Arrays.asList(optionParams.getOrDefault(LakeSoulOptions.READ_START_TIME(), ""),
                        optionParams.getOrDefault(LakeSoulOptions.TIME_ZONE(), ""));
        String readType = optionParams.getOrDefault(LakeSoulOptions.READ_TYPE(), "");
        if (this.isStreaming) {
            String partDesc = optionParams.getOrDefault(LakeSoulOptions.PARTITION_DESC(), "");
            if (partDesc.isEmpty()) {
                if (remainingPartitions != null && !remainingPartitions.isEmpty()) {
                    // fall back to the partition pruned by the planner
                    if (remainingPartitions.size() > 1) {
                        throw new RuntimeException("Streaming read allows only one specified partition," +
                                " or no specified partition to incrementally read the entire table");
                    }
                    partDesc = DBUtil.formatPartitionDesc(remainingPartitions.get(0));
                }
            }
            return new LakeSoulDynamicSplitEnumerator(enumContext,
                    new LakeSoulDynSplitAssigner(optionParams.getOrDefault(LakeSoulOptions.HASH_BUCKET_NUM(), "-1")),
                    Long.parseLong(optionParams.getOrDefault(LakeSoulOptions.DISCOVERY_INTERVAL(), "30000")),
                    convertTimeFormatWithTimeZone(readStartTimestampWithTimeZone),
                    tif.getTableId(),
                    partDesc,
                    optionParams.getOrDefault(LakeSoulOptions.HASH_BUCKET_NUM(), "-1"));
        } else {
            return staticSplitEnumerator(enumContext,
                    tif,
                    readStartTimestampWithTimeZone,
                    readType);
        }
    }
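    /**
     * Builds the split list for a bounded read. An empty or "fullread" READ_TYPE lists all
     * live files of the selected partitions; any other read type lists files committed
     * between READ_START_TIME and READ_END_TIME per partition. Files are grouped into one
     * split per hash bucket when the table has a hash partition, otherwise one split per file.
     */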
    private LakeSoulStaticSplitEnumerator staticSplitEnumerator(SplitEnumeratorContext<LakeSoulSplit> enumContext,
                                                                TableInfo tif,
                                                                List<String> readStartTimestampWithTimeZone,
                                                                String readType) {
        List<String> readEndTimestampWithTimeZone =
                Arrays.asList(optionParams.getOrDefault(LakeSoulOptions.READ_END_TIME(), ""),
                        optionParams.getOrDefault(LakeSoulOptions.TIME_ZONE(), ""));
        List<DataFileInfo> dfinfos;
        if (readType.equals("") || readType.equals("fullread")) {
            dfinfos = Arrays.asList(getTargetDataFileInfo(tif));
        } else {
            dfinfos = new ArrayList<>();
            List<String> partDescs = new ArrayList<>();
            String partitionDescOpt = optionParams.getOrDefault(LakeSoulOptions.PARTITION_DESC(), "");
            if (partitionDescOpt.isEmpty() && remainingPartitions != null) {
                for (Map<String, String> part : remainingPartitions) {
                    String desc = DBUtil.formatPartitionDesc(part);
                    partDescs.add(desc);
                }
            } else {
                partDescs.add(partitionDescOpt);
            }
            for (String desc : partDescs) {
                dfinfos.addAll(Arrays.asList(DataOperation.getIncrementalPartitionDataInfo(tif.getTableId(),
                        desc,
                        convertTimeFormatWithTimeZone(readStartTimestampWithTimeZone),
                        convertTimeFormatWithTimeZone(readEndTimestampWithTimeZone),
                        readType)));
            }
        }
        int capacity = 100;
        ArrayList<LakeSoulSplit> splits = new ArrayList<>(capacity);
        if (!FlinkUtil.isExistHashPartition(tif)) {
            // no hash partition: one split per data file
            for (DataFileInfo dfinfo : dfinfos) {
                ArrayList<Path> tmp = new ArrayList<>();
                tmp.add(new Path(dfinfo.path()));
                splits.add(new LakeSoulSplit(String.valueOf(dfinfo.hashCode()),
                        tmp,
                        0));
            }
        } else {
            // hash partition: group files by range partition and hash bucket, one split per bucket
            Map<String, Map<Integer, List<Path>>> splitByRangeAndHashPartition =
                    FlinkUtil.splitDataInfosToRangeAndHashPartition(tif.getTableId(),
                            dfinfos.toArray(new DataFileInfo[0]));
            for (Map.Entry<String, Map<Integer, List<Path>>> entry : splitByRangeAndHashPartition.entrySet()) {
                for (Map.Entry<Integer, List<Path>> split : entry.getValue().entrySet()) {
                    splits.add(new LakeSoulSplit(String.valueOf(split.hashCode()),
                            split.getValue(),
                            0));
                }
            }
        }
        return new LakeSoulStaticSplitEnumerator(enumContext,
                new LakeSoulSimpleSplitAssigner(splits));
    }
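    /** Lists the table's live data files, restricted to {@code remainingPartitions} when set. */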
    private DataFileInfo[] getTargetDataFileInfo(TableInfo tif) {
        return FlinkUtil.getTargetDataFileInfo(tif,
                this.remainingPartitions);
    }
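    /**
     * Parses a "yyyy-MM-dd HH:mm:ss" timestamp in the given zone into epoch milliseconds.
     * Falls back to the JVM default zone when the zone is empty or unrecognized, and returns 0
     * when the timestamp is empty. For example, ("2024-01-30 12:00:00", "Asia/Shanghai")
     * yields 1706587200000L.
     */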
    private long convertTimeFormatWithTimeZone(List<String> readTimestampWithTimeZone) {
        String time = readTimestampWithTimeZone.get(0);
        String timeZone = readTimestampWithTimeZone.get(1);
        if (timeZone.equals("") || !Arrays.asList(TimeZone.getAvailableIDs()).contains(timeZone)) {
            timeZone = TimeZone.getDefault().getID();
        }
        long readTimeStamp = 0;
        if (!time.equals("")) {
            readTimeStamp = LocalDateTime.parse(time,
                            DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"))
                    .atZone(ZoneId.of(timeZone)).toInstant().toEpochMilli();
        }
        return readTimeStamp;
    }
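    /**
     * Restores the streaming enumerator from checkpointed state: the splits still pending at
     * checkpoint time plus the last read timestamp, so file discovery resumes where it left off.
     */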
    @Override
    public SplitEnumerator<LakeSoulSplit, LakeSoulPendingSplits> restoreEnumerator(
            SplitEnumeratorContext<LakeSoulSplit> enumContext,
            LakeSoulPendingSplits checkpoint) throws Exception {
        return new LakeSoulDynamicSplitEnumerator(enumContext,
                new LakeSoulDynSplitAssigner(checkpoint.getSplits(),
                        String.valueOf(checkpoint.getHashBucketNum())),
                checkpoint.getDiscoverInterval(),
                checkpoint.getLastReadTimestamp(),
                checkpoint.getTableid(),
                checkpoint.getParDesc(),
                String.valueOf(checkpoint.getHashBucketNum()));
    }
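    /** Serializer for the {@link LakeSoulSplit}s shipped from the enumerator to readers. */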
    @Override
    public SimpleVersionedSerializer<LakeSoulSplit> getSplitSerializer() {
        return new SimpleLakeSoulSerializer();
    }
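    /** Serializer for the enumerator's {@link LakeSoulPendingSplits} checkpoint state. */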
    @Override
    public SimpleVersionedSerializer<LakeSoulPendingSplits> getEnumeratorCheckpointSerializer() {
        return new SimpleLakeSoulPendingSplitsSerializer();
    }
}