All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.flink.api.java.io.SplitDataProperties Maven / Gradle / Ivy

There is a newer version: 1.20.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.api.java.io;

import org.apache.flink.annotation.PublicEvolving;
import org.apache.flink.api.common.InvalidProgramException;
import org.apache.flink.api.common.functions.Partitioner;
import org.apache.flink.api.common.operators.GenericDataSourceBase;
import org.apache.flink.api.common.operators.Keys;
import org.apache.flink.api.common.operators.Order;
import org.apache.flink.api.common.operators.Ordering;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.operators.DataSource;

import java.util.Arrays;

/**
 * SplitDataProperties define data properties on {@link org.apache.flink.core.io.InputSplit}
 * generated by the {@link org.apache.flink.api.common.io.InputFormat} of a {@link DataSource}.
 *
 * 

InputSplits are units of input which are distributed among and assigned to parallel data source subtasks. * SplitDataProperties can define that the elements which are generated by the associated InputFormat * are *

    *
  • Partitioned on one or more fields across InputSplits, i.e., all elements with the same * (combination of) key(s) are located in the same input split.
  • *
  • Grouped on one or more fields within an InputSplit, i.e., all elements of an input split * that have the same (combination of) key(s) are emitted in a single sequence one after the other.
  • *
  • Ordered on one or more fields within an InputSplit, i.e., all elements within an input split * are in the defined order.
  • *
* *

IMPORTANT: SplitDataProperties can improve the execution of a program because certain * data reorganization steps such as shuffling or sorting can be avoided. * HOWEVER, if SplitDataProperties are not correctly defined, the result of the program might be wrong! * * @param The type of the DataSource on which the SplitDataProperties are defined. * * @see org.apache.flink.core.io.InputSplit * @see org.apache.flink.api.common.io.InputFormat * @see org.apache.flink.api.java.operators.DataSource */ @PublicEvolving public class SplitDataProperties implements GenericDataSourceBase.SplitDataProperties { private TypeInformation type; private int[] splitPartitionKeys; private Partitioner splitPartitioner; private int[] splitGroupKeys; private Ordering splitOrdering; /** * Creates SplitDataProperties for the given data types. * * @param type The data type of the SplitDataProperties. */ public SplitDataProperties(TypeInformation type) { this.type = type; } /** * Creates SplitDataProperties for the given data types. * * @param source The DataSource for which the SplitDataProperties are created. */ public SplitDataProperties(DataSource source) { this.type = source.getType(); } /** * Defines that data is partitioned across input splits on the fields defined by field positions. * All records sharing the same key (combination) must be contained in a single input split. * *

* IMPORTANT: Providing wrong information with SplitDataProperties can cause wrong results! * * * @param partitionFields The field positions of the partitioning keys. * @return This SplitDataProperties object. */ public SplitDataProperties splitsPartitionedBy(int... partitionFields) { return this.splitsPartitionedBy(null, partitionFields); } /** * Defines that data is partitioned using a specific partitioning method * across input splits on the fields defined by field positions. * All records sharing the same key (combination) must be contained in a single input split. * *

* IMPORTANT: Providing wrong information with SplitDataProperties can cause wrong results! * * * @param partitionMethodId An ID for the method that was used to partition the data across splits. * @param partitionFields The field positions of the partitioning keys. * @return This SplitDataProperties object. */ public SplitDataProperties splitsPartitionedBy(String partitionMethodId, int... partitionFields) { if (partitionFields == null) { throw new InvalidProgramException("PartitionFields may not be null."); } else if (partitionFields.length == 0) { throw new InvalidProgramException("PartitionFields may not be empty."); } this.splitPartitionKeys = getAllFlatKeys(partitionFields); if (partitionMethodId != null) { this.splitPartitioner = new SourcePartitionerMarker<>(partitionMethodId); } else { this.splitPartitioner = null; } return this; } /** * Defines that data is partitioned across input splits on the fields defined by field expressions. * Multiple field expressions must be separated by the semicolon ';' character. * All records sharing the same key (combination) must be contained in a single input split. * *

* IMPORTANT: Providing wrong information with SplitDataProperties can cause wrong results! * * * @param partitionFields The field expressions of the partitioning keys. * @return This SplitDataProperties object. */ public SplitDataProperties splitsPartitionedBy(String partitionFields) { return this.splitsPartitionedBy(null, partitionFields); } /** * Defines that data is partitioned using an identifiable method * across input splits on the fields defined by field expressions. * Multiple field expressions must be separated by the semicolon ';' character. * All records sharing the same key (combination) must be contained in a single input split. * *

* IMPORTANT: Providing wrong information with SplitDataProperties can cause wrong results! * * * @param partitionMethodId An ID for the method that was used to partition the data across splits. * @param partitionFields The field expressions of the partitioning keys. * @return This SplitDataProperties object. */ public SplitDataProperties splitsPartitionedBy(String partitionMethodId, String partitionFields) { if (partitionFields == null) { throw new InvalidProgramException("PartitionFields may not be null."); } String[] partitionKeysA = partitionFields.split(";"); if (partitionKeysA.length == 0) { throw new InvalidProgramException("PartitionFields may not be empty."); } this.splitPartitionKeys = getAllFlatKeys(partitionKeysA); if (partitionMethodId != null) { this.splitPartitioner = new SourcePartitionerMarker<>(partitionMethodId); } else { this.splitPartitioner = null; } return this; } /** * Defines that the data within an input split is grouped on the fields defined by the field positions. * All records sharing the same key (combination) must be subsequently emitted by the input * format for each input split. * *

* IMPORTANT: Providing wrong information with SplitDataProperties can cause wrong results! * * * @param groupFields The field positions of the grouping keys. * @return This SplitDataProperties object. */ public SplitDataProperties splitsGroupedBy(int... groupFields) { if (groupFields == null) { throw new InvalidProgramException("GroupFields may not be null."); } else if (groupFields.length == 0) { throw new InvalidProgramException("GroupFields may not be empty."); } if (this.splitOrdering != null) { throw new InvalidProgramException("DataSource may either be grouped or sorted."); } this.splitGroupKeys = getAllFlatKeys(groupFields); return this; } /** * Defines that the data within an input split is grouped on the fields defined by the field expressions. * Multiple field expressions must be separated by the semicolon ';' character. * All records sharing the same key (combination) must be subsequently emitted by the input * format for each input split. * *

* IMPORTANT: Providing wrong information with SplitDataProperties can cause wrong results! * * * @param groupFields The field expressions of the grouping keys. * @return This SplitDataProperties object. */ public SplitDataProperties splitsGroupedBy(String groupFields) { if (groupFields == null) { throw new InvalidProgramException("GroupFields may not be null."); } String[] groupKeysA = groupFields.split(";"); if (groupKeysA.length == 0) { throw new InvalidProgramException("GroupFields may not be empty."); } if (this.splitOrdering != null) { throw new InvalidProgramException("DataSource may either be grouped or sorted."); } this.splitGroupKeys = getAllFlatKeys(groupKeysA); return this; } /** * Defines that the data within an input split is sorted on the fields defined by the field positions * in the specified orders. * All records of an input split must be emitted by the input format in the defined order. * *

* IMPORTANT: Providing wrong information with SplitDataProperties can cause wrong results! * * * @param orderFields The field positions of the grouping keys. * @param orders The orders of the fields. * @return This SplitDataProperties object. */ public SplitDataProperties splitsOrderedBy(int[] orderFields, Order[] orders) { if (orderFields == null || orders == null) { throw new InvalidProgramException("OrderFields or Orders may not be null."); } else if (orderFields.length == 0) { throw new InvalidProgramException("OrderFields may not be empty."); } else if (orders.length == 0) { throw new InvalidProgramException("Orders may not be empty"); } else if (orderFields.length != orders.length) { throw new InvalidProgramException("Number of OrderFields and Orders must match."); } if (this.splitGroupKeys != null) { throw new InvalidProgramException("DataSource may either be grouped or sorted."); } this.splitOrdering = new Ordering(); for (int i = 0; i < orderFields.length; i++) { int pos = orderFields[i]; int[] flatKeys = this.getAllFlatKeys(new int[]{pos}); for (int key : flatKeys) { // check for duplicates for (int okey : splitOrdering.getFieldPositions()) { if (key == okey) { throw new InvalidProgramException("Duplicate field in the field expression " + pos); } } // append key this.splitOrdering.appendOrdering(key, null, orders[i]); } } return this; } /** * Defines that the data within an input split is sorted on the fields defined by the field expressions * in the specified orders. Multiple field expressions must be separated by the semicolon ';' character. * All records of an input split must be emitted by the input format in the defined order. * *

* IMPORTANT: Providing wrong information with SplitDataProperties can cause wrong results! * * * @param orderFields The field expressions of the grouping key. * @param orders The orders of the fields. * @return This SplitDataProperties object. */ public SplitDataProperties splitsOrderedBy(String orderFields, Order[] orders) { if (orderFields == null || orders == null) { throw new InvalidProgramException("OrderFields or Orders may not be null."); } String[] orderKeysA = orderFields.split(";"); if (orderKeysA.length == 0) { throw new InvalidProgramException("OrderFields may not be empty."); } else if (orders.length == 0) { throw new InvalidProgramException("Orders may not be empty"); } else if (orderKeysA.length != orders.length) { throw new InvalidProgramException("Number of OrderFields and Orders must match."); } if (this.splitGroupKeys != null) { throw new InvalidProgramException("DataSource may either be grouped or sorted."); } this.splitOrdering = new Ordering(); for (int i = 0; i < orderKeysA.length; i++) { String keyExp = orderKeysA[i]; Keys.ExpressionKeys ek = new Keys.ExpressionKeys<>(keyExp, this.type); int[] flatKeys = ek.computeLogicalKeyPositions(); for (int key : flatKeys) { // check for duplicates for (int okey : splitOrdering.getFieldPositions()) { if (key == okey) { throw new InvalidProgramException("Duplicate field in field expression " + keyExp); } } // append key this.splitOrdering.appendOrdering(key, null, orders[i]); } } return this; } public int[] getSplitPartitionKeys() { return this.splitPartitionKeys; } public Partitioner getSplitPartitioner() { return this.splitPartitioner; } public int[] getSplitGroupKeys() { return this.splitGroupKeys; } public Ordering getSplitOrder() { return this.splitOrdering; } /////////////////////// FLAT FIELD EXTRACTION METHODS private int[] getAllFlatKeys(String[] fieldExpressions) { int[] allKeys = null; for (String keyExp : fieldExpressions) { Keys.ExpressionKeys ek = new Keys.ExpressionKeys<>(keyExp, 
this.type); int[] flatKeys = ek.computeLogicalKeyPositions(); if (allKeys == null) { allKeys = flatKeys; } else { // check for duplicates for (int key1 : flatKeys) { for (int key2 : allKeys) { if (key1 == key2) { throw new InvalidProgramException("Duplicate fields in field expression " + keyExp); } } } // append flat keys int oldLength = allKeys.length; int newLength = oldLength + flatKeys.length; allKeys = Arrays.copyOf(allKeys, newLength); System.arraycopy(flatKeys, 0, allKeys, oldLength, flatKeys.length); } } return allKeys; } private int[] getAllFlatKeys(int[] fieldPositions) { Keys.ExpressionKeys ek = new Keys.ExpressionKeys<>(fieldPositions, this.type); return ek.computeLogicalKeyPositions(); } /** * A custom partitioner to mark compatible split partitionings. * * @param The type of the partitioned data. */ public static class SourcePartitionerMarker implements Partitioner { private static final long serialVersionUID = -8554223652384442571L; String partitionMarker; public SourcePartitionerMarker(String partitionMarker) { this.partitionMarker = partitionMarker; } @Override public int partition(T key, int numPartitions) { throw new UnsupportedOperationException("The SourcePartitionerMarker is only used as a marker for compatible partitioning. " + "It must not be invoked."); } @Override public boolean equals(Object o) { if (o instanceof SourcePartitionerMarker) { return this.partitionMarker.equals(((SourcePartitionerMarker) o).partitionMarker); } else { return false; } } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy