All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.kyligence.kap.clickhouse.job.LoadInfo Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.kyligence.kap.clickhouse.job;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.stream.Collectors;

import org.apache.kylin.common.util.RandomUtil;
import org.apache.kylin.metadata.cube.model.LayoutEntity;
import org.apache.kylin.metadata.cube.model.NDataSegment;
import org.apache.kylin.metadata.model.NDataModel;

import org.apache.kylin.guava30.shaded.common.base.Preconditions;

import io.kyligence.kap.secondstorage.metadata.PartitionType;
import io.kyligence.kap.secondstorage.metadata.SegmentFileStatus;
import io.kyligence.kap.secondstorage.metadata.TableData;
import io.kyligence.kap.secondstorage.metadata.TableEntity;
import io.kyligence.kap.secondstorage.metadata.TableFlow;
import io.kyligence.kap.secondstorage.metadata.TablePartition;
import lombok.Getter;
import lombok.val;

public class LoadInfo {
    final NDataModel model;
    final NDataSegment segment;
    final String segmentId; // it is required for updating meta after load
    String oldSegmentId;
    final String[] nodeNames;
    final LayoutEntity layout;
    final List> shardFiles;
    final TableFlow tableFlow;
    @Getter
    private final TableEntity tableEntity;

    private String targetDatabase;
    private String targetTable;

    List containsOldSegmentTableData = new ArrayList<>(10);

    @SuppressWarnings("unchecked")
    private static  List newFixedSizeList(int size) {
        return (List) Arrays.asList(new Object[size]);
    }

    private LoadInfo(NDataModel model, NDataSegment segment, LayoutEntity layout, String[] nodeNames,
            TableFlow tableFlow, TableEntity tableEntity) {
        this(model, segment, null, layout, nodeNames, tableFlow, tableEntity);
    }

    private LoadInfo(NDataModel model, NDataSegment segment, String oldSegmentId, LayoutEntity layout,
            String[] nodeNames, TableFlow tableFlow, TableEntity tableEntity) {
        this.model = model;
        this.segment = segment;
        final int shardNumber = nodeNames.length;
        this.nodeNames = nodeNames;
        this.segmentId = segment.getId();
        this.layout = layout;
        this.shardFiles = newFixedSizeList(shardNumber);
        this.oldSegmentId = oldSegmentId;
        for (int i = 0; i < shardNumber; ++i) {
            this.shardFiles.set(i, new ArrayList<>(100));
        }
        this.tableFlow = tableFlow;
        this.tableEntity = tableEntity;
    }

    /**
     * ClickHouse doesn't support the separation of storage and compute, so it's hard to scale horizontally.
     * It results in the long term use of a fixed number of shards. We have two cases:
     * 
    *
  • Full Load
  • *
  • Incremental Load
  • *
* The problem here is how to distribute files across multiple shards evenly in incremental load? Consider the * case where table index building always generates 10 parquet files every day, and unfortunately, we only have 3 * shards. If we always distribute files from index 0, then shard 0 will have 10 more files than the other two after * ten days. i.e. *
    *
  • shard 0: 40 *
  • shard 1: 30 *
  • shard 2: 40 *
* * TODO: Incremental Load * TODO: Use a simple way to avoid this issue -- randomly choose the start shard each time we distribute loads. * TODO: fault-tolerant for randomly choosing the start shard? * */ public static LoadInfo distribute(String[] nodeNames, NDataModel model, NDataSegment segment, FileProvider provider, LayoutEntity layout, TableFlow tableFlow, TableEntity tableEntity) { int shardNum = nodeNames.length; final LoadInfo info = new LoadInfo(model, segment, layout, nodeNames, tableFlow, tableEntity); val it = provider.getAllFilePaths().iterator(); int index = 0; while (it.hasNext()) { FileStatus fileStatus = it.next(); info.addShardFile(index, fileStatus.getPath(), fileStatus.getLen()); index = ++index % shardNum; } return info; } public LoadInfo setTargetDatabase(String targetDatabase) { this.targetDatabase = targetDatabase; return this; } public LoadInfo setTargetTable(String targetTable) { this.targetTable = targetTable; return this; } public String getTargetTable() { return this.targetTable; } public LoadInfo setOldSegmentId(String oldSegmentId) { this.oldSegmentId = oldSegmentId; this.tableFlow.getTableDataList().forEach(tableData -> { if (tableData.getAllSegments().contains(oldSegmentId)) { containsOldSegmentTableData.add(tableData); } }); return this; } private void addShardFile(int shardIndex, String filePath, long fileLen) { Preconditions.checkArgument(shardIndex < shardFiles.size()); shardFiles.get(shardIndex).add(SegmentFileStatus.builder().setLen(fileLen).setPath(filePath).build()); } public LayoutEntity getLayout() { return layout; } public List> getShardFiles() { return shardFiles.stream() .map(files -> files.stream().map(SegmentFileStatus::getPath).collect(Collectors.toList())) .collect(Collectors.toList()); } public String getSegmentId() { return segmentId; } public String[] getNodeNames() { return nodeNames; } // meta update public TablePartition createMetaInfo() { Map> nodeFileMap = new HashMap<>(); ListIterator it = Arrays.asList(nodeNames).listIterator(); while (it.hasNext()) { int idx = it.nextIndex(); nodeFileMap.put(it.next(), shardFiles.get(idx)); } val metric = new ClickHouseTableStorageMetric(Arrays.asList(this.nodeNames)); metric.collect(false); Preconditions.checkNotNull(targetDatabase); Preconditions.checkNotNull(targetTable); String dateFormat = null; if (model.isIncrementBuildOnExpertMode()) { dateFormat = model.getPartitionDesc().getPartitionDateFormat(); } Map sizeInNode = metric.getByPartitions(targetDatabase, targetTable, segment.getSegRange(), dateFormat); return TablePartition.builder().setSegmentId(segmentId).setShardNodes(Arrays.asList(nodeNames)) .setId(RandomUtil.randomUUIDStr()).setNodeFileMap(nodeFileMap).setSizeInNode(sizeInNode) .setSecondaryIndexColumns(tableEntity.getSecondaryIndexColumns()).build(); } public void upsertTableData(TableFlow copied, String database, String table, PartitionType partitionType) { copied.upsertTableData(layout, tableData -> { Preconditions.checkArgument(tableData.getPartitionType() == partitionType); if (tableData.getLayoutID() != layout.getId()) { return; } if (oldSegmentId != null) { tableData.removePartitions(tablePartition -> tablePartition.getSegmentId().equals(oldSegmentId)); } tableData.addPartition(createMetaInfo()); }, () -> TableData.builder().setPartitionType(partitionType).setLayoutEntity(getLayout()).setDatabase(database) .setTable(table).build()); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy