/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.iceberg.spark.procedures;

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.Predicate;
import org.apache.iceberg.MetadataColumns;
import org.apache.iceberg.Table;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
import org.apache.iceberg.spark.ChangelogIterator;
import org.apache.iceberg.spark.source.SparkChangelogTable;
import org.apache.spark.api.java.function.MapPartitionsFunction;
import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.encoders.RowEncoder;
import org.apache.spark.sql.connector.catalog.Identifier;
import org.apache.spark.sql.connector.catalog.TableCatalog;
import org.apache.spark.sql.connector.iceberg.catalog.ProcedureParameter;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.unsafe.types.UTF8String;

/**
 * A procedure that creates a view for changed rows.
 *
 * <p>The procedure always removes the carry-over rows. Please query {@link SparkChangelogTable}
 * instead when carry-over rows are required.
 *
 * <p>The procedure doesn't compute the pre/post update images by default. If you want to compute
 * them, you can set "compute_updates" to be true in the options.
 *
 * <p>Carry-over rows are the result of a removal and insertion of the same row within an operation
 * because of the copy-on-write mechanism. For example, given a file which contains row1 (id=1,
 * data='a') and row2 (id=2, data='b'). A copy-on-write delete of row2 would require erasing this
 * file and preserving row1 in a new file. The changelog table would report this as (id=1, data='a',
 * op='DELETE') and (id=1, data='a', op='INSERT'), despite it not being an actual change to the
 * table. The procedure finds the carry-over rows and removes them from the result.
 *
 * <p>Pre/post update images are converted from a pair of a delete row and an insert row. Identifier
 * columns are used for determining whether an insert and a delete record refer to the same row. If
 * the two records share the same values for the identity columns they are considered to be before
 * and after states of the same row. You can either set identifier fields in the table schema or
 * input them as the procedure parameters. Here is an example of pre/post update images with an
 * identifier column (id). A pair of a delete row and an insert row with the same id:
 *
 * <ul>
 *   <li>(id=1, data='a', op='DELETE')
 *   <li>(id=1, data='b', op='INSERT')
 * </ul>
 *
 * <p>will be marked as pre/post update images:
 *
 * <ul>
 *   <li>(id=1, data='a', op='UPDATE_BEFORE')
 *   <li>(id=1, data='b', op='UPDATE_AFTER')
 * </ul>
 */
public class CreateChangelogViewProcedure extends BaseProcedure {

  private static final ProcedureParameter TABLE_PARAM =
      ProcedureParameter.required("table", DataTypes.StringType);
  private static final ProcedureParameter CHANGELOG_VIEW_PARAM =
      ProcedureParameter.optional("changelog_view", DataTypes.StringType);
  private static final ProcedureParameter OPTIONS_PARAM =
      ProcedureParameter.optional("options", STRING_MAP);
  private static final ProcedureParameter COMPUTE_UPDATES_PARAM =
      ProcedureParameter.optional("compute_updates", DataTypes.BooleanType);
  private static final ProcedureParameter IDENTIFIER_COLUMNS_PARAM =
      ProcedureParameter.optional("identifier_columns", STRING_ARRAY);
  private static final ProcedureParameter NET_CHANGES =
      ProcedureParameter.optional("net_changes", DataTypes.BooleanType);

  private static final ProcedureParameter[] PARAMETERS =
      new ProcedureParameter[] {
        TABLE_PARAM,
        CHANGELOG_VIEW_PARAM,
        OPTIONS_PARAM,
        COMPUTE_UPDATES_PARAM,
        IDENTIFIER_COLUMNS_PARAM,
        NET_CHANGES,
      };

  private static final StructType OUTPUT_TYPE =
      new StructType(
          new StructField[] {
            new StructField("changelog_view", DataTypes.StringType, false, Metadata.empty())
          });

  public static SparkProcedures.ProcedureBuilder builder() {
    return new BaseProcedure.Builder<CreateChangelogViewProcedure>() {
      @Override
      protected CreateChangelogViewProcedure doBuild() {
        return new CreateChangelogViewProcedure(tableCatalog());
      }
    };
  }

  private CreateChangelogViewProcedure(TableCatalog tableCatalog) {
    super(tableCatalog);
  }

  @Override
  public ProcedureParameter[] parameters() {
    return PARAMETERS;
  }

  @Override
  public StructType outputType() {
    return OUTPUT_TYPE;
  }

  @Override
  public InternalRow[] call(InternalRow args) {
    ProcedureInput input = new ProcedureInput(spark(), tableCatalog(), PARAMETERS, args);

    Identifier tableIdent = input.ident(TABLE_PARAM);

    // load insert and deletes from the changelog table
    Identifier changelogTableIdent = changelogTableIdent(tableIdent);
    Dataset<Row> df = loadRows(changelogTableIdent, options(input));

    boolean netChanges = input.asBoolean(NET_CHANGES, false);

    if (shouldComputeUpdateImages(input)) {
      Preconditions.checkArgument(!netChanges, "Not support net changes with update images");
      df = computeUpdateImages(identifierColumns(input, tableIdent), df);
    } else {
      df = removeCarryoverRows(df, netChanges);
    }

    String viewName = viewName(input, tableIdent.name());

    df.createOrReplaceTempView(viewName);

    return toOutputRows(viewName);
  }

  private Dataset<Row> computeUpdateImages(String[] identifierColumns, Dataset<Row> df) {
    Preconditions.checkArgument(
        identifierColumns.length > 0,
        "Cannot compute the update images because identifier columns are not set");

    Column[] repartitionSpec = new Column[identifierColumns.length + 1];
    for (int i = 0; i < identifierColumns.length; i++) {
      repartitionSpec[i] = df.col(identifierColumns[i]);
    }

    repartitionSpec[repartitionSpec.length - 1] = df.col(MetadataColumns.CHANGE_ORDINAL.name());

    return applyChangelogIterator(df, repartitionSpec);
  }

  private boolean shouldComputeUpdateImages(ProcedureInput input) {
    // If the identifier columns are set, we compute pre/post update images by default.
    boolean defaultValue = input.isProvided(IDENTIFIER_COLUMNS_PARAM);
    return input.asBoolean(COMPUTE_UPDATES_PARAM, defaultValue);
  }

  private Dataset<Row> removeCarryoverRows(Dataset<Row> df, boolean netChanges) {
    // Compare rows on every data column; the change-type column (and, for net changes, the
    // other changelog metadata columns) is excluded so a delete and re-insert of the same
    // row can be recognized as a carry-over pair.
    Predicate<String> columnsToKeep;
    if (netChanges) {
      Set<String> metadataColumn =
          Sets.newHashSet(
              MetadataColumns.CHANGE_TYPE.name(),
              MetadataColumns.CHANGE_ORDINAL.name(),
              MetadataColumns.COMMIT_SNAPSHOT_ID.name());

      columnsToKeep = column -> !metadataColumn.contains(column);
    } else {
      columnsToKeep = column -> !column.equals(MetadataColumns.CHANGE_TYPE.name());
    }

    Column[] repartitionSpec =
        Arrays.stream(df.columns()).filter(columnsToKeep).map(df::col).toArray(Column[]::new);

    return applyCarryoverRemoveIterator(df, repartitionSpec, netChanges);
  }

  private String[] identifierColumns(ProcedureInput input, Identifier tableIdent) {
    if (input.isProvided(IDENTIFIER_COLUMNS_PARAM)) {
      return input.asStringArray(IDENTIFIER_COLUMNS_PARAM);
    } else {
      Table table = loadSparkTable(tableIdent).table();
      return table.schema().identifierFieldNames().toArray(new String[0]);
    }
  }

  private Identifier changelogTableIdent(Identifier tableIdent) {
    List<String> namespace = Lists.newArrayList();
    namespace.addAll(Arrays.asList(tableIdent.namespace()));
    namespace.add(tableIdent.name());
    return Identifier.of(namespace.toArray(new String[0]), SparkChangelogTable.TABLE_NAME);
  }

  private Map<String, String> options(ProcedureInput input) {
    return input.asStringMap(OPTIONS_PARAM, ImmutableMap.of());
  }

  private String viewName(ProcedureInput input, String tableName) {
    String defaultValue = String.format("`%s_changes`", tableName);
    return input.asString(CHANGELOG_VIEW_PARAM, defaultValue);
  }

  private Dataset<Row> applyChangelogIterator(Dataset<Row> df, Column[] repartitionSpec) {
    Column[] sortSpec = sortSpec(df, repartitionSpec, false);
    StructType schema = df.schema();
    String[] identifierFields =
        Arrays.stream(repartitionSpec).map(Column::toString).toArray(String[]::new);

    return df.repartition(repartitionSpec)
        .sortWithinPartitions(sortSpec)
        .mapPartitions(
            (MapPartitionsFunction<Row, Row>)
                rowIterator ->
                    ChangelogIterator.computeUpdates(rowIterator, schema, identifierFields),
            RowEncoder.apply(schema));
  }

  private Dataset<Row> applyCarryoverRemoveIterator(
      Dataset<Row> df, Column[] repartitionSpec, boolean netChanges) {
    Column[] sortSpec = sortSpec(df, repartitionSpec, netChanges);
    StructType schema = df.schema();

    return df.repartition(repartitionSpec)
        .sortWithinPartitions(sortSpec)
        .mapPartitions(
            (MapPartitionsFunction<Row, Row>)
                rowIterator ->
                    netChanges
                        ? ChangelogIterator.removeNetCarryovers(rowIterator, schema)
                        : ChangelogIterator.removeCarryovers(rowIterator, schema),
            RowEncoder.apply(schema));
  }

  private static Column[] sortSpec(Dataset<Row> df, Column[] repartitionSpec, boolean netChanges) {
    Column changeType = df.col(MetadataColumns.CHANGE_TYPE.name());
    Column changeOrdinal = df.col(MetadataColumns.CHANGE_ORDINAL.name());
    Column[] extraColumns =
        netChanges ? new Column[] {changeOrdinal, changeType} : new Column[] {changeType};

    Column[] sortSpec = new Column[repartitionSpec.length + extraColumns.length];

    System.arraycopy(repartitionSpec, 0, sortSpec, 0, repartitionSpec.length);
    System.arraycopy(extraColumns, 0, sortSpec, repartitionSpec.length, extraColumns.length);

    return sortSpec;
  }

  private InternalRow[] toOutputRows(String viewName) {
    InternalRow row = newInternalRow(UTF8String.fromString(viewName));
    return new InternalRow[] {row};
  }

  @Override
  public String description() {
    return "CreateChangelogViewProcedure";
  }
}
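For orientation, here is a minimal usage sketch. It assumes the procedure is exposed as create_changelog_view through Iceberg's Spark SQL extensions (that registration is not part of this file), and it uses placeholder names: a catalog called demo, a table db.tbl with an id column, and a local warehouse path. Adjust these for a real environment.

import org.apache.spark.sql.SparkSession;

public class CreateChangelogViewExample {
  public static void main(String[] args) {
    // Iceberg's SQL extensions provide the CALL syntax used below.
    SparkSession spark =
        SparkSession.builder()
            .appName("changelog-view-example")
            .master("local[*]")
            .config(
                "spark.sql.extensions",
                "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
            .config("spark.sql.catalog.demo", "org.apache.iceberg.spark.SparkCatalog")
            .config("spark.sql.catalog.demo.type", "hadoop")
            .config("spark.sql.catalog.demo.warehouse", "/tmp/warehouse")
            .getOrCreate();

    // table, changelog_view and identifier_columns correspond to TABLE_PARAM,
    // CHANGELOG_VIEW_PARAM and IDENTIFIER_COLUMNS_PARAM above. Supplying
    // identifier_columns switches on pre/post update images by default
    // (see shouldComputeUpdateImages).
    spark.sql(
        "CALL demo.system.create_changelog_view("
            + "table => 'db.tbl', "
            + "changelog_view => 'tbl_changes', "
            + "identifier_columns => array('id'))");

    // The procedure's single output column is the name of the temp view it created,
    // which can then be queried like any other view.
    spark.sql("SELECT * FROM tbl_changes").show();
  }
}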

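Continuing in the same SparkSession as the sketch above, a net-changes variant might look like the following. As the check in call() shows, net_changes cannot be combined with update images, so no identifier columns are passed here; the view name tbl_net_changes is again a placeholder.

    // net_changes => true keeps only the net effect of each row across the scanned
    // snapshots; carry-over rows are removed in both modes.
    spark.sql(
        "CALL demo.system.create_changelog_view("
            + "table => 'db.tbl', "
            + "changelog_view => 'tbl_net_changes', "
            + "net_changes => true)");

    spark.sql("SELECT * FROM tbl_net_changes").show();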


