org.apache.hudi.table.lookup.HoodieLookupFunction
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.table.lookup;

import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.configuration.HadoopConfigurations;
import org.apache.hudi.util.StreamerUtil;

import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeutils.TypeSerializer;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.table.data.GenericRowData;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.functions.FunctionContext;
import org.apache.flink.table.functions.TableFunction;
import org.apache.flink.table.runtime.typeutils.InternalSerializers;
import org.apache.flink.table.runtime.typeutils.InternalTypeInfo;
import org.apache.flink.table.types.logical.RowType;
import org.apache.flink.util.FlinkRuntimeException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.time.Duration;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Lookup function for Hoodie dimension table.
 *
 * <p>Note: reference Flink FileSystemLookupFunction to avoid additional connector jar dependencies.
 */
public class HoodieLookupFunction extends TableFunction<RowData> {
  private static final Logger LOG = LoggerFactory.getLogger(HoodieLookupFunction.class);

  // the max number of retries before throwing an exception, in case of failure to load the table
  // into cache
  private static final int MAX_RETRIES = 3;
  // interval between retries
  private static final Duration RETRY_INTERVAL = Duration.ofSeconds(10);

  private final HoodieLookupTableReader partitionReader;
  private final RowData.FieldGetter[] lookupFieldGetters;
  private final Duration reloadInterval;
  private final TypeSerializer<RowData> serializer;
  private final RowType rowType;

  // cache for lookup data, keyed by the lookup key fields
  private transient Map<RowData, List<RowData>> cache;
  // timestamp when the cache expires
  private transient long nextLoadTime;

  private transient HoodieTableMetaClient metaClient;
  // the commit instant backing the current cache contents
  private transient HoodieInstant currentCommit;

  private final Configuration conf;

  public HoodieLookupFunction(
      HoodieLookupTableReader partitionReader,
      RowType rowType,
      int[] lookupKeys,
      Duration reloadInterval,
      Configuration conf) {
    this.partitionReader = partitionReader;
    this.rowType = rowType;
    this.lookupFieldGetters = new RowData.FieldGetter[lookupKeys.length];
    for (int i = 0; i < lookupKeys.length; i++) {
      lookupFieldGetters[i] =
          RowData.createFieldGetter(rowType.getTypeAt(lookupKeys[i]), lookupKeys[i]);
    }
    this.reloadInterval = reloadInterval;
    this.serializer = InternalSerializers.create(rowType);
    this.conf = conf;
  }

  @Override
  public void open(FunctionContext context) throws Exception {
    super.open(context);
    cache = new HashMap<>();
    nextLoadTime = -1L;
    org.apache.hadoop.conf.Configuration hadoopConf = HadoopConfigurations.getHadoopConf(conf);
    metaClient = StreamerUtil.metaClientForReader(conf, hadoopConf);
  }

  @Override
  public TypeInformation<RowData> getResultType() {
    return InternalTypeInfo.of(rowType);
  }

  public void eval(Object... values) {
    checkCacheReload();
    RowData lookupKey = GenericRowData.of(values);
    List<RowData> matchedRows = cache.get(lookupKey);
    if (matchedRows != null) {
      for (RowData matchedRow : matchedRows) {
        collect(matchedRow);
      }
    }
  }

  private void checkCacheReload() {
    if (nextLoadTime > System.currentTimeMillis()) {
      return;
    }
    if (nextLoadTime > 0) {
      LOG.info(
          "Lookup join cache has expired after {} minute(s), reloading",
          reloadInterval.toMinutes());
    } else {
      LOG.info("Populating lookup join cache");
    }
    HoodieActiveTimeline activeTimeline = metaClient.reloadActiveTimeline();
    Option<HoodieInstant> latestCommitInstant = activeTimeline.getCommitsTimeline().lastInstant();
    if (!latestCommitInstant.isPresent()) {
      LOG.info("No commit instant found currently.");
      return;
    }
    // Determine whether to reload data by comparing instants
    if (currentCommit != null && latestCommitInstant.get().equals(currentCommit)) {
      LOG.info("Ignore loading data because the commit instant " + currentCommit + " has not changed.");
      return;
    }
    int numRetry = 0;
    while (true) {
      cache.clear();
      try {
        long count = 0;
        GenericRowData reuse = new GenericRowData(rowType.getFieldCount());
        partitionReader.open();
        RowData row;
        while ((row = partitionReader.read(reuse)) != null) {
          count++;
          // copy the (possibly reused) row before caching it
          RowData rowData = serializer.copy(row);
          RowData key = extractLookupKey(rowData);
          List<RowData> rows = cache.computeIfAbsent(key, k -> new ArrayList<>());
          rows.add(rowData);
        }
        partitionReader.close();
        // remember the commit that backs the cache so an unchanged table skips the next reload
        currentCommit = latestCommitInstant.get();
        nextLoadTime = System.currentTimeMillis() + reloadInterval.toMillis();
        LOG.info("Loaded {} row(s) into lookup join cache", count);
        return;
      } catch (Exception e) {
        if (numRetry >= MAX_RETRIES) {
          throw new FlinkRuntimeException(
              String.format("Failed to load table into cache after %d retries", numRetry), e);
        }
        numRetry++;
        long toSleep = numRetry * RETRY_INTERVAL.toMillis();
        LOG.warn(
            String.format("Failed to load table into cache, will retry in %d seconds", toSleep / 1000),
            e);
        try {
          Thread.sleep(toSleep);
        } catch (InterruptedException ex) {
          LOG.warn("Interrupted while waiting to retry failed cache load, aborting");
          throw new FlinkRuntimeException(ex);
        }
      }
    }
  }

  private RowData extractLookupKey(RowData row) {
    GenericRowData key = new GenericRowData(lookupFieldGetters.length);
    for (int i = 0; i < lookupFieldGetters.length; i++) {
      key.setField(i, lookupFieldGetters[i].getFieldOrNull(row));
    }
    return key;
  }

  @Override
  public void close() throws Exception {
    // no operation
  }

  @VisibleForTesting
  public Duration getReloadInterval() {
    return reloadInterval;
  }
}
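The snippet below is a hypothetical usage sketch, not part of the Hudi source above. It shows how the constructor might be wired for a dimension table with schema (id INT, name STRING) that is joined on the "id" field; the reader and Flink configuration arguments are placeholders the caller must supply.

import java.time.Duration;

import org.apache.flink.configuration.Configuration;
import org.apache.flink.table.types.logical.IntType;
import org.apache.flink.table.types.logical.LogicalType;
import org.apache.flink.table.types.logical.RowType;
import org.apache.flink.table.types.logical.VarCharType;

/** Hypothetical usage sketch for HoodieLookupFunction. */
class HoodieLookupFunctionSketch {

  static HoodieLookupFunction build(HoodieLookupTableReader reader, Configuration flinkConf) {
    // Dimension table schema: ROW<id INT, name STRING>.
    RowType rowType = RowType.of(
        new LogicalType[] {new IntType(), new VarCharType(VarCharType.MAX_LENGTH)},
        new String[] {"id", "name"});
    // Join key is field 0 ("id"); reload the lookup cache every 30 minutes.
    return new HoodieLookupFunction(
        reader, rowType, new int[] {0}, Duration.ofMinutes(30), flinkConf);
  }
}

Passing several indices in the lookupKeys array builds a composite cache key, one field getter per key column, in the given order.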