All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hudi.common.table.timeline.LSMTimeline Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.hudi.common.table.timeline;

import org.apache.hudi.common.model.HoodieLSMTimelineManifest;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.util.ArchivedInstantReadSchemas;
import org.apache.hudi.common.util.FileIOUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.storage.StoragePath;
import org.apache.hudi.storage.StoragePathFilter;
import org.apache.hudi.storage.StoragePathInfo;

import org.apache.avro.Schema;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import static org.apache.hudi.common.util.StringUtils.fromUTF8Bytes;

/**
 * Represents the LSM Timeline for the Hoodie table.
 *
 * 

After several instants are accumulated as a batch on the active timeline, they would be flushed as a parquet file into the LSM timeline. * In general the timeline is comprised with parquet files with LSM style file layout. Each new operation to the timeline yields * a new snapshot version. Theoretically, there could be multiple snapshot versions on the timeline. * *

The LSM Timeline Layout

* *
 *   t111, t112 ... t120 ... ->
 *     \              /
 *        \        /
 *            |
 *            V
 *   t111_t120_0.parquet, t101_t110_0.parquet,...  t11_t20_0.parquet    L0
 *                                  \                    /
 *                                     \              /
 *                                            |
 *                                            V
 *                                    t11_t100_1.parquet                L1
 *
 *      manifest_1, manifest_2, ... manifest_12
 *                                      |
 *                                      V
 *                                  _version_
 * 
* *

The LSM Tree Compaction

* Use the universal compaction strategy, that is: when N(by default 10) number of parquet files exist in the current layer, they are merged and flush as a compacted file in the next layer. * We have no limit for the layer number, assumes there are 10 instants for each file in L0, there could be 100 instants per file in L1, * so 3000 instants could be represented as 3 parquets in L2, it is pretty fast if we apply concurrent read. * *

The benchmark shows 1000 instants reading cost about 10 ms. * *

The Archiver & Reader Snapshot Isolation

* *

In order to make snapshot isolation of the LSM timeline write/read, we add two kinds of metadata files for the LSM tree version management: *

    *
  1. Manifest file: Each new file in layer 0 or each compaction would generate a new manifest file, the manifest file records the valid file handles of the latest snapshot;
  2. *
  3. Version file: A version file is generated right after a new manifest file is formed.
  4. *
* *

The Reader Workflow

*
    *
  • read the latest version;
  • *
  • read the manifest file for valid file handles;
  • *
  • read the data files, probably do a data skipping with the parquet file name max min timestamp.
  • *
* *

The Legacy Files Cleaning and Read Retention

* Only triggers file cleaning after a valid compaction. * *

Clean Strategy

* Keeps only 3 valid snapshot versions for the reader, that means, a file is kept for at lest 3 archival trigger interval, for default configuration, it is 30 instants time span, * which is far longer that the LSM timeline loading time. * *

Instants TTL

* The timeline reader only reads instants of last limited days. We will by default skip the instants from LSM timeline that are generated long time ago. */ public class LSMTimeline { private static final Logger LOG = LoggerFactory.getLogger(LSMTimeline.class); public static final int LSM_TIMELINE_INSTANT_VERSION_1 = 1; private static final String VERSION_FILE_NAME = "_version_"; // _version_ private static final String MANIFEST_FILE_PREFIX = "manifest_"; // manifest_[N] private static final String TEMP_FILE_SUFFIX = ".tmp"; private static final Pattern ARCHIVE_FILE_PATTERN = Pattern.compile("^(\\d+)_(\\d+)_(\\d)\\.parquet"); // ------------------------------------------------------------------------- // Utilities // ------------------------------------------------------------------------- public static Schema getReadSchema(HoodieArchivedTimeline.LoadMode loadMode) { switch (loadMode) { case TIME: return ArchivedInstantReadSchemas.TIMELINE_LSM_READ_SCHEMA_WITH_TIME; case ACTION: return ArchivedInstantReadSchemas.TIMELINE_LSM_READ_SCHEMA_WITH_ACTION; case METADATA: return ArchivedInstantReadSchemas.TIMELINE_LSM_READ_SCHEMA_WITH_METADATA; case PLAN: return ArchivedInstantReadSchemas.TIMELINE_LSM_READ_SCHEMA_WITH_PLAN; case FULL: return ArchivedInstantReadSchemas.TIMELINE_LSM_READ_SCHEMA_AS_FULL; default: throw new AssertionError("Unexpected"); } } /** * Returns whether the given file is located in the filter. */ public static boolean isFileInRange(HoodieArchivedTimeline.TimeRangeFilter filter, String fileName) { String minInstant = getMinInstantTime(fileName); String maxInstant = getMaxInstantTime(fileName); return filter.isInRange(minInstant) || filter.isInRange(maxInstant); } /** * Returns the latest snapshot version. */ public static int latestSnapshotVersion(HoodieTableMetaClient metaClient, StoragePath archivePath) throws IOException { StoragePath versionFilePath = getVersionFilePath(archivePath); if (metaClient.getStorage().exists(versionFilePath)) { try { Option content = FileIOUtils.readDataFromPath(metaClient.getStorage(), versionFilePath); if (content.isPresent()) { return Integer.parseInt(fromUTF8Bytes(content.get())); } } catch (Exception e) { // fallback to manifest file listing. LOG.warn("Error reading version file {}", versionFilePath, e); } } return allSnapshotVersions(metaClient, archivePath).stream().max(Integer::compareTo).orElse(-1); } /** * Returns all the valid snapshot versions. */ public static List allSnapshotVersions(HoodieTableMetaClient metaClient, StoragePath archivePath) throws IOException { if (!metaClient.getStorage().exists(archivePath)) { return Collections.emptyList(); } return metaClient.getStorage().listDirectEntries(archivePath, getManifestFilePathFilter()) .stream() .map(fileStatus -> fileStatus.getPath().getName()) .map(LSMTimeline::getManifestVersion) .collect(Collectors.toList()); } /** * Returns the latest snapshot metadata files. */ public static HoodieLSMTimelineManifest latestSnapshotManifest(HoodieTableMetaClient metaClient, StoragePath archivePath) throws IOException { int latestVersion = latestSnapshotVersion(metaClient, archivePath); return latestSnapshotManifest(metaClient, latestVersion, archivePath); } /** * Reads the file list from the manifest file for the latest snapshot. */ public static HoodieLSMTimelineManifest latestSnapshotManifest(HoodieTableMetaClient metaClient, int latestVersion, StoragePath archivePath) { if (latestVersion < 0) { // there is no valid snapshot of the timeline. return HoodieLSMTimelineManifest.EMPTY; } // read and deserialize the valid files. byte[] content = FileIOUtils.readDataFromPath(metaClient.getStorage(), getManifestFilePath(latestVersion, archivePath)).get(); try { return HoodieLSMTimelineManifest.fromJsonString(fromUTF8Bytes(content), HoodieLSMTimelineManifest.class); } catch (Exception e) { throw new HoodieException("Error deserializing manifest entries", e); } } /** * Returns the full manifest file path with given version number. */ public static StoragePath getManifestFilePath(int snapshotVersion, StoragePath archivePath) { return new StoragePath(archivePath, MANIFEST_FILE_PREFIX + snapshotVersion); } /** * Returns the full version file path with given version number. */ public static StoragePath getVersionFilePath(StoragePath archivePath) { return new StoragePath(archivePath, VERSION_FILE_NAME); } /** * List all the parquet manifest files. */ public static List listAllManifestFiles(HoodieTableMetaClient metaClient, StoragePath archivePath) throws IOException { return metaClient.getStorage().listDirectEntries( archivePath, getManifestFilePathFilter()); } /** * List all the parquet metadata files. */ public static List listAllMetaFiles(HoodieTableMetaClient metaClient, StoragePath archivePath) throws IOException { return metaClient.getStorage().globEntries( new StoragePath(archivePath, "*.parquet")); } /** * Parse the snapshot version from the manifest file name. */ public static int getManifestVersion(String fileName) { return Integer.parseInt(fileName.split("_")[1]); } /** * Parse the layer number from the file name. */ public static int getFileLayer(String fileName) { try { Matcher fileMatcher = ARCHIVE_FILE_PATTERN.matcher(fileName); if (fileMatcher.matches()) { return Integer.parseInt(fileMatcher.group(3)); } } catch (NumberFormatException e) { // log and ignore any format warnings LOG.warn("error getting file layout for archived file: {}", fileName); } // return default value in case of any errors return 0; } /** * Parse the minimum instant time from the file name. */ public static String getMinInstantTime(String fileName) { Matcher fileMatcher = ARCHIVE_FILE_PATTERN.matcher(fileName); if (fileMatcher.matches()) { return fileMatcher.group(1); } else { throw new HoodieException("Unexpected archival file name: " + fileName); } } /** * Parse the maximum instant time from the file name. */ public static String getMaxInstantTime(String fileName) { Matcher fileMatcher = ARCHIVE_FILE_PATTERN.matcher(fileName); if (fileMatcher.matches()) { return fileMatcher.group(2); } else { throw new HoodieException("Unexpected archival file name: " + fileName); } } /** * Returns whether a file belongs to the specified layer {@code layer} within the LSM layout. */ public static boolean isFileFromLayer(String fileName, int layer) { return getFileLayer(fileName) == layer; } /** * Returns a path filter for the manifest files. */ public static StoragePathFilter getManifestFilePathFilter() { return path -> path.getName().startsWith(MANIFEST_FILE_PREFIX) && !path.getName().endsWith(TEMP_FILE_SUFFIX); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy