package com.alibaba.ververica.connectors.odps.dim;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.typeutils.ResultTypeQueryable;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.table.api.TableException;
import org.apache.flink.table.api.TableSchema;
import org.apache.flink.table.data.GenericRowData;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.runtime.typeutils.RowDataSerializer;
import org.apache.flink.table.runtime.typeutils.RowDataTypeInfo;
import org.apache.flink.table.types.logical.LogicalType;
import org.apache.flink.table.types.logical.RowType;
import org.apache.flink.util.Collector;
import org.apache.flink.util.FlinkRuntimeException;
import com.alibaba.ververica.connectors.common.dim.DimJoinFetcher;
import com.alibaba.ververica.connectors.common.dim.cache.AllCache;
import com.alibaba.ververica.connectors.common.dim.cache.CacheConfig;
import com.alibaba.ververica.connectors.common.dim.reload.CacheAllReloadConf;
import com.alibaba.ververica.connectors.common.dim.reload.SerializableRunnable;
import com.alibaba.ververica.connectors.common.errorcode.ConnectorErrors;
import com.alibaba.ververica.connectors.common.exception.ErrorUtils;
import com.alibaba.ververica.connectors.common.util.DateUtil;
import com.alibaba.ververica.connectors.odps.OdpsConf;
import com.alibaba.ververica.connectors.odps.schema.ODPSColumn;
import com.alibaba.ververica.connectors.odps.type.ODPSType;
import com.alibaba.ververica.connectors.odps.util.OdpsMetadataProvider;
import com.alibaba.ververica.connectors.odps.util.OdpsUtils;
import com.alibaba.ververica.connectors.odps.util.PartitionConditionParser;
import com.aliyun.odps.Column;
import com.aliyun.odps.Odps;
import com.aliyun.odps.Partition;
import com.aliyun.odps.PartitionSpec;
import com.aliyun.odps.Table;
import com.aliyun.odps.data.Record;
import com.aliyun.odps.data.RecordReader;
import com.aliyun.odps.tunnel.TableTunnel;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.List;

/**
 * Row Fetcher for ODPS table.
 */
public class ODPSCachedRowFetcher extends DimJoinFetcher
        implements FlatMapFunction<RowData, RowData>, ResultTypeQueryable<RowData> {

    private static final Logger LOG = LoggerFactory.getLogger(ODPSCachedRowFetcher.class);

    private final String sqlTableName;
    private final OdpsConf odpsConf;
    private final String tableName;
    private final RowType rowType;
    private final ODPSColumn[] selectedColumns;
    protected final int fieldLength;
    private final boolean isPartitionedTable;
    protected final List<Integer> sourceKeys;
    protected final List<Integer> targetKeys;
    private final LogicalType[] keyTypes;
    private int maxRetries = 10;
    private List<String> specificPartitions;
    private Long maxRowCount;
    private RowDataSerializer serializer;
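
    /**
     * Creates a cached row fetcher that periodically loads the full ODPS table
     * (or the single matched partition) into an in-memory "all" cache and serves
     * lookup joins from that cache.
     */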
    public ODPSCachedRowFetcher(
            String sqlTableName,
            OdpsConf odpsConf,
            String tableName,
            RowType rowType,
            ODPSColumn[] selectedColumns,
            String[] lookupKeys,
            boolean isPartitionedTable,
            TableSchema tableSchema,
            CacheConfig cacheConfig,
            List<String> specificPartitions,
            Long maxRowCount) {
        super(sqlTableName, lookupKeys, cacheConfig.getCacheStrategy());
        this.sqlTableName = sqlTableName;
        this.odpsConf = odpsConf;
        this.tableName = tableName;
        this.rowType = rowType;
        this.selectedColumns = selectedColumns;
        this.isPartitionedTable = isPartitionedTable;
        CacheAllReloadConf reloadConf = new CacheAllReloadConf(
                cacheConfig.getTimeRangeBlacklist(),
                cacheConfig.getCacheScanLimit(),
                cacheConfig.getCacheStrategy().getTtlMs());
        setAllCacheReloadRunner(new ReloadCacheRunner(), reloadConf);
        this.sourceKeys = new ArrayList<>();
        this.targetKeys = new ArrayList<>();
        this.serializer = new RowDataSerializer(null, rowType);
        this.fieldLength = selectedColumns.length;
        this.keyTypes = new LogicalType[lookupKeys.length];
        String[] fieldNames = tableSchema.getFieldNames();
        for (int i = 0; i < lookupKeys.length; i++) {
            sourceKeys.add(i);
            int targetIdx = getColumnIndex(lookupKeys[i], fieldNames);
            if (targetIdx < 0) {
                throw new TableException("Column: " + lookupKeys[i] + " doesn't exist.");
            }
            targetKeys.add(targetIdx);
            keyTypes[i] = rowType.getTypeAt(targetIdx);
        }
        this.specificPartitions = specificPartitions;
        this.maxRowCount = maxRowCount;
    }

    @Override
    public void openConnection(Configuration parameters) {
        // no need to prepare connection here.
    }

    @Override
    public void closeConnection() {
        // no need to close connection here.
    }
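
    /**
     * Looks up the cached row for the join key of the incoming row: waits until the
     * "all" cache has been loaded at least once, then reads it under the read lock
     * and emits the matching row, if any.
     */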
    @Override
    public void flatMap(RowData row, Collector<RowData> collector) throws Exception {
        Object key = getSourceKey(row);
        if (key == null) {
            LOG.debug("Join ODPS on an empty key of row: {}", row);
            return;
        }
        while (!one2oneAllCacheHandler.isLoadedOrThrowException()) {
            // the cache has not finished its first load yet, so sleep a while and wait
            Thread.sleep(10);
        }
        one2oneAllCacheHandler.lock.readLock().lock();
        RowData cachedRow;
        try {
            cachedRow = one2oneAllCacheHandler.get(key);
        } finally {
            one2oneAllCacheHandler.lock.readLock().unlock();
        }
        if (cachedRow != null) {
            collector.collect(cachedRow);
        }
    }

    @Override
    public TypeInformation<RowData> getProducedType() {
        return RowDataTypeInfo.of(rowType);
    }

    protected Object getSourceKey(RowData source) {
        return getKey(source, sourceKeys, keyTypes);
    }

    protected Object prepareCacheKey(RowData target) {
        return getKey(target, targetKeys, keyTypes);
    }
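
    /**
     * Reload task that downloads all rows of the ODPS table (or of the single matched
     * partition) through a TableTunnel download session, converts each record into a
     * {@link RowData} keyed by the lookup columns, and then switches the cache to the
     * freshly loaded snapshot. Transient failures are retried up to {@code maxRetries} times.
     */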
    private class ReloadCacheRunner extends SerializableRunnable {

        @Override
        public void run() {
            int attemptNum = 1;
            AllCache allCache = one2oneAllCacheHandler;
            while (attemptNum <= maxRetries) {
                try {
                    long currentTime = System.currentTimeMillis();
                    if (DateUtil.isTimeInRange(reloadConf.timeRangeBlackList, currentTime)) {
                        if (allCache.isLoaded()) {
                            LOG.info("Current time {} is in reload black list, so try to reload cache next time.", currentTime);
                            return;
                        } else {
                            LOG.info("Current time {} is in reload black list, but this is the first time to load cache, so still load.", currentTime);
                            // do not return: the cache has never been loaded, so load it anyway
                        }
                    }
                    LOG.info("Reloading all data from ODPS '{}' ...", sqlTableName);
                    allCache.initialize();
                    // reload all data...
                    long startTime = System.nanoTime();
                    long readCount = 0L;
                    // create a new scan every time
                    TableTunnel tunnel = OdpsUtils.createTableTunnel(odpsConf);
                    String project = odpsConf.getProject();
                    PartitionSpec currentPartitionSpec = null;
                    final TableTunnel.DownloadSession downloadSession;
                    if (isPartitionedTable) {
                        if (specificPartitions != null && !specificPartitions.isEmpty()) {
                            Odps odps = OdpsUtils.initOdps(odpsConf);
                            Table table = OdpsMetadataProvider.getTable(odps, project, tableName);
                            final List<Partition> matchedPartitions = PartitionConditionParser
                                    .filter(table.getPartitions(), specificPartitions);
                            if (matchedPartitions == null || matchedPartitions.isEmpty()) {
                                LOG.warn(
                                        "Did not find any matching partition for table {}.{}, partition condition is {}! Not fetching data until the next period.",
                                        project, tableName, specificPartitions);
                                allCache.switchCache();
                                break;
                            } else if (matchedPartitions.size() > 1) {
                                LOG.error(
                                        "Found multiple matching partitions for table {}.{}, partition condition is {}!",
                                        project, tableName, specificPartitions);
                                throw new TableException("Only one partition is supported when an ODPS table is used as a dimension table!");
                            } else {
                                currentPartitionSpec = matchedPartitions.get(0).getPartitionSpec();
                                downloadSession = tunnel
                                        .createDownloadSession(project, tableName, currentPartitionSpec);
                            }
                        } else {
                            LOG.error("No partition specified for table {}.{}.", project, tableName);
                            throw new TableException("Specified partitions cannot be empty for a partitioned table!");
                        }
                    } else {
                        downloadSession = tunnel.createDownloadSession(project, tableName);
                    }
LOG.info("Session Status is : " + downloadSession.getStatus().toString());
long count = downloadSession.getRecordCount();
LOG.info("RecordCount is: " + count);
if (count <= 0) {
LOG.warn(
"RecordCount is: {} for table {}.{}, partition condition is {}! And don't fetch data until next period.",
count, project, tableName, specificPartitions);
allCache.switchCache();
break;
}
if (count > maxRowCount) {
LOG.error(
"RecordCount is: {} exceed max limitation {} for table {}.{}, partition condition is {}!",
count, maxRowCount, project, tableName, specificPartitions);
ErrorUtils.throwException(
ConnectorErrors.INST.odpsTableExceedMaxRowCountError(
tableName,
currentPartitionSpec != null ? currentPartitionSpec.toString() : null,
String.valueOf(maxRowCount)));
}
List columns = new ArrayList<>();
for (ODPSColumn column : selectedColumns) {
if (column != null && !column.isPartition()) {
columns.add(new Column(column.getName(), column.getType()));
}
}
RecordReader recordReader = downloadSession.openRecordReader(0, count, false, columns);
Record record;
while ((record = recordReader.read()) != null) {
readCount++;
GenericRowData resultRow = new GenericRowData(fieldLength);
// parse to row
for (int idx = 0; idx < fieldLength; idx++) {
ODPSColumn column = selectedColumns[idx];
String columnName = column.getName();
ODPSType odpsType = ODPSType.valueOf(column.getType().name());
if (column.isPartition()) {
odpsType.setRowField(resultRow, idx, currentPartitionSpec.get(columnName));
} else {
odpsType.setRowField(resultRow, idx, record, columnName);
}
}
Object key = prepareCacheKey(resultRow);
// TODO exception when duplicate data on uk ?
one2oneAllCacheHandler.put(key, serializer.copy(resultRow));
}
recordReader.close();
long endTime = System.nanoTime();
LOG.info("Loaded {} rows from ODPS '{}' into cache, used {}ms.",
readCount,
sqlTableName,
(endTime - startTime) / 1000_000);
allCache.switchCache();
break;
                } catch (Throwable t) {
                    // catch all throwable errors
                    FlinkRuntimeException e = new FlinkRuntimeException("Error happens in reload thread.", t);
                    if (t instanceof InterruptedException || t instanceof OutOfMemoryError) {
                        LOG.error("Error happens when scanning all data from ODPS.", e);
                        allCache.setException(e);
                    } else {
                        attemptNum++;
                        if (attemptNum <= maxRetries) {
                            LOG.warn(
                                    "Error happens when scanning all data from ODPS, retrying (attempt {}).",
                                    attemptNum,
                                    e);
                            try {
                                // back off before the next attempt
                                Thread.sleep(1000 * attemptNum);
                            } catch (InterruptedException e1) {
                                LOG.error("Cache reload thread is interrupted", e1);
                                ErrorUtils.throwException(
                                        "Thread sleep may be interrupted by main thread.",
                                        e1);
                            }
                        } else {
                            LOG.error("Error happens when scanning all data from ODPS.", e);
                            allCache.setException(e);
                        }
                    }
                }
            }
        }
    }
}