org.apache.hudi.common.util.InternalSchemaCache Maven / Gradle / Ivy
The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.common.util;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.CommitMetadataSerDe;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.table.timeline.InstantFileNameGenerator;
import org.apache.hudi.common.table.timeline.InstantFileNameParser;
import org.apache.hudi.common.table.timeline.InstantGenerator;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.internal.schema.InternalSchema;
import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter;
import org.apache.hudi.internal.schema.io.FileBasedInternalSchemaStorageManager;
import org.apache.hudi.internal.schema.utils.InternalSchemaUtils;
import org.apache.hudi.internal.schema.utils.SerDeHelper;
import org.apache.hudi.storage.HoodieStorage;
import org.apache.hudi.storage.StoragePath;
import com.github.benmanes.caffeine.cache.Cache;
import com.github.benmanes.caffeine.cache.Caffeine;
import org.apache.avro.Schema;
import org.apache.hudi.storage.StoragePathInfo;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
import java.util.TreeMap;
import java.util.stream.Collectors;
/**
* An internal cache implementation for managing different versions of schemas.
* This is a Global cache; all threads in one container/executor share the same cache.
* A map of (tablePath, HistorySchemas) is maintained.
*/
public class InternalSchemaCache {
private static final Logger LOG = LoggerFactory.getLogger(InternalSchemaCache.class);
// Use segment locks to reduce competition: a table path is hashed onto one of these monitors.
// The array length must be a power of 2, because the segment index is computed by masking
// the hash code with (lockList.length - 1).
// The array reference is final: it is assigned once and only its slots are filled below.
private static final Object[] lockList = new Object[16];

static {
  for (int i = 0; i < lockList.length; i++) {
    lockList[i] = new Object();
  }
}
// historySchemas cache maintain a map about (tablePath, HistorySchemas).
// this is a Global cache, all threads in one container/executor share the same cache.
private static final Cache>
HISTORICAL_SCHEMA_CACHE = Caffeine.newBuilder().maximumSize(1000).weakValues().build();
/**
 * Search internalSchema based on versionID.
 *
 * <p>First step: try to get the internalSchema from the hoodie commit files; no lock is needed for that.
 * If the first step finds nothing, look the schema up in the per-table historical schemas kept in the
 * global cache. The cached history is reloaded from storage (and re-cached) when it is absent, empty,
 * unable to resolve {@code versionID}, or when {@code versionID} is newer than every cached version.
 *
 * @param versionID schema version_id need to search
 * @param metaClient current hoodie metaClient
 * @return internalSchema, as resolved by {@code InternalSchemaUtils.searchSchema} when the commit files
 *         do not contain it
 */
public static InternalSchema searchSchemaAndCache(long versionID, HoodieTableMetaClient metaClient) {
  Option<InternalSchema> candidateSchema = getSchemaByReadingCommitFile(versionID, metaClient);
  if (candidateSchema.isPresent()) {
    return candidateSchema.get();
  }
  String tablePath = metaClient.getBasePath().toString();
  // use segment lock to reduce competition.
  synchronized (lockList[tablePath.hashCode() & (lockList.length - 1)]) {
    TreeMap<Long, InternalSchema> historicalSchemas = HISTORICAL_SCHEMA_CACHE.getIfPresent(tablePath);
    // The isEmpty() check must precede the max(...).get() below: an empty history may have been
    // cached by an earlier call, and taking the maximum of an empty key set would throw
    // NoSuchElementException instead of triggering a reload.
    boolean needsReload = historicalSchemas == null
        || historicalSchemas.isEmpty()
        || InternalSchemaUtils.searchSchema(versionID, historicalSchemas) == null
        || versionID > historicalSchemas.keySet().stream().max(Long::compareTo).get();
    if (needsReload) {
      historicalSchemas = getHistoricalSchemas(metaClient);
      HISTORICAL_SCHEMA_CACHE.put(tablePath, historicalSchemas);
    }
    return InternalSchemaUtils.searchSchema(versionID, historicalSchemas);
  }
}
/**
 * Loads the historical schemas of a table.
 *
 * <p>Reads the history schema string through a {@link FileBasedInternalSchemaStorageManager} and parses
 * it with {@code SerDeHelper.parseSchemas}.
 *
 * @param metaClient current hoodie metaClient
 * @return the parsed historical schemas (presumably keyed by schema version id; confirm against
 *         {@code SerDeHelper.parseSchemas}), or an empty map when the history schema string is
 *         null or empty
 */
private static TreeMap<Long, InternalSchema> getHistoricalSchemas(HoodieTableMetaClient metaClient) {
  FileBasedInternalSchemaStorageManager schemasManager = new FileBasedInternalSchemaStorageManager(metaClient);
  String historySchemaStr = schemasManager.getHistorySchemaStr();
  if (StringUtils.isNullOrEmpty(historySchemaStr)) {
    // Nothing has been persisted: hand back an empty, mutable map rather than null.
    return new TreeMap<>();
  }
  return SerDeHelper.parseSchemas(historySchemaStr);
}
/**
 * Tries to read the internalSchema from the metadata of a completed commit.
 *
 * <p>Looks for completed instants on the active commits timeline whose requested time equals
 * {@code versionID} rendered as a string. The first match is deserialized into
 * {@link HoodieCommitMetadata}, and its {@code SerDeHelper.LATEST_SCHEMA} entry is parsed with
 * {@code SerDeHelper.fromJson}.
 *
 * @param versionID schema version_id to look for
 * @param metaClient current hoodie metaClient
 * @return {@code Option.empty()} when no completed commit instant matches; otherwise whatever
 *         {@code SerDeHelper.fromJson} yields for the stored schema string
 * @throws HoodieException if any step of reading or deserializing the commit metadata fails
 */
private static Option<InternalSchema> getSchemaByReadingCommitFile(long versionID, HoodieTableMetaClient metaClient) {
  try {
    HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
    // Computed once instead of once per instant inside the filter.
    String versionTime = String.valueOf(versionID);
    List<HoodieInstant> instants = timeline.getInstantsAsStream()
        .filter(candidate -> candidate.requestedTime().equals(versionTime))
        .collect(Collectors.toList());
    if (instants.isEmpty()) {
      return Option.empty();
    }
    HoodieInstant matched = instants.get(0);
    byte[] data = timeline.getInstantDetails(matched).get();
    HoodieCommitMetadata metadata = metaClient.getCommitMetadataSerDe().deserialize(matched, data, HoodieCommitMetadata.class);
    String latestInternalSchemaStr = metadata.getMetadata(SerDeHelper.LATEST_SCHEMA);
    return SerDeHelper.fromJson(latestInternalSchemaStr);
  } catch (Exception e) {
    throw new HoodieException("Failed to read schema from commit metadata", e);
  }
}
/**
* Get internalSchema and avroSchema for compaction/cluster operation.
*
* @param metaClient current hoodie metaClient
* @param compactionAndClusteringInstant first instant before current compaction/cluster instant
* @return (internalSchemaStrOpt, avroSchemaStrOpt) a pair of InternalSchema/avroSchema
*/
public static Pair
© 2015 - 2025 Weber Informatics LLC | Privacy Policy