org.apache.hudi.hadoop.HoodieParquetInputFormat Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.hadoop;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.hadoop.utils.HoodieHiveUtils;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.hadoop.avro.HoodieTimestampAwareParquetInputFormat;
import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
import org.apache.hadoop.hive.ql.io.parquet.read.ParquetRecordReaderWrapper;
import org.apache.hadoop.hive.ql.io.sarg.ConvertAstToSearchArg;
import org.apache.hadoop.hive.ql.plan.TableScanDesc;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hudi.hadoop.utils.HoodieRealtimeInputFormatUtils;
import org.apache.parquet.hadoop.ParquetInputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.lang.reflect.Constructor;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
/**
* HoodieInputFormat which understands the Hoodie File Structure and filters files based on the Hoodie Mode. If paths
* that does not correspond to a hoodie table then they are passed in as is (as what FileInputFormat.listStatus()
* would do). The JobConf could have paths from multiple Hoodie/Non-Hoodie tables
*/
@UseRecordReaderFromInputFormat
@UseFileSplitsFromInputFormat
public class HoodieParquetInputFormat extends HoodieParquetInputFormatBase {
private static final Logger LOG = LoggerFactory.getLogger(HoodieParquetInputFormat.class);
private boolean supportAvroRead = false;
public HoodieParquetInputFormat() {
super(new HoodieCopyOnWriteTableInputFormat());
initAvroInputFormat();
}
protected HoodieParquetInputFormat(HoodieCopyOnWriteTableInputFormat delegate) {
super(delegate);
initAvroInputFormat();
}
/**
* Spark2 use `parquet.hadoopParquetInputFormat` in `com.twitter:parquet-hadoop-bundle`.
* So that we need to distinguish the constructions of classes with
* `parquet.hadoopParquetInputFormat` or `org.apache.parquet.hadoop.ParquetInputFormat`.
* If we use `org.apache.parquet:parquet-hadoop`, we can use `HudiAvroParquetInputFormat`
* in Hive or Spark3 to get timestamp with correct type.
*/
private void initAvroInputFormat() {
try {
Constructor[] constructors = ParquetRecordReaderWrapper.class.getConstructors();
if (Arrays.stream(constructors)
.anyMatch(c -> c.getParameterCount() > 0 && c.getParameterTypes()[0]
.getName().equals(ParquetInputFormat.class.getName()))) {
supportAvroRead = true;
}
} catch (SecurityException e) {
throw new HoodieException("Failed to check if support avro reader: " + e.getMessage(), e);
}
}
@Override
public RecordReader getRecordReader(final InputSplit split, final JobConf job,
final Reporter reporter) throws IOException {
// TODO enable automatic predicate pushdown after fixing issues
// FileSplit fileSplit = (FileSplit) split;
// HoodieTableMetadata metadata = getTableMetadata(fileSplit.getPath().getParent());
// String tableName = metadata.getTableName();
// String mode = HoodieHiveUtil.readMode(job, tableName);
// if (HoodieHiveUtil.INCREMENTAL_SCAN_MODE.equals(mode)) {
// FilterPredicate predicate = constructHoodiePredicate(job, tableName, split);
// LOG.info("Setting parquet predicate push down as " + predicate);
// ParquetInputFormat.setFilterPredicate(job, predicate);
// clearOutExistingPredicate(job);
// }
if (split instanceof BootstrapBaseFileSplit) {
return createBootstrappingRecordReader(split, job, reporter);
}
// adapt schema evolution
new SchemaEvolutionContext(split, job).doEvolutionForParquetFormat();
if (LOG.isDebugEnabled()) {
LOG.debug("EMPLOYING DEFAULT RECORD READER - " + split);
}
HoodieRealtimeInputFormatUtils.addProjectionField(job, job.get(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, "").split("/"));
return getRecordReaderInternal(split, job, reporter);
}
private RecordReader getRecordReaderInternal(InputSplit split,
JobConf job,
Reporter reporter) throws IOException {
try {
if (supportAvroRead && HoodieColumnProjectionUtils.supportTimestamp(job)) {
return new ParquetRecordReaderWrapper(new HoodieTimestampAwareParquetInputFormat(), split, job, reporter);
} else {
return super.getRecordReader(split, job, reporter);
}
} catch (final InterruptedException | IOException e) {
throw new RuntimeException("Cannot create a RecordReaderWrapper", e);
}
}
private RecordReader createBootstrappingRecordReader(InputSplit split,
JobConf job,
Reporter reporter) throws IOException {
BootstrapBaseFileSplit eSplit = (BootstrapBaseFileSplit) split;
String[] rawColNames = HoodieColumnProjectionUtils.getReadColumnNames(job);
List rawColIds = HoodieColumnProjectionUtils.getReadColumnIDs(job);
List> projectedColsWithIndex =
IntStream.range(0, rawColIds.size()).mapToObj(idx -> Pair.of(rawColIds.get(idx), rawColNames[idx]))
.collect(Collectors.toList());
List> hoodieColsProjected = projectedColsWithIndex.stream()
.filter(idxWithName -> HoodieRecord.HOODIE_META_COLUMNS.contains(idxWithName.getValue()))
.collect(Collectors.toList());
List> externalColsProjected = projectedColsWithIndex.stream()
.filter(idxWithName -> !HoodieRecord.HOODIE_META_COLUMNS.contains(idxWithName.getValue())
&& !HoodieHiveUtils.VIRTUAL_COLUMN_NAMES.contains(idxWithName.getValue()))
.collect(Collectors.toList());
// This always matches hive table description
List> colNameWithTypes = HoodieColumnProjectionUtils.getIOColumnNameAndTypes(job);
List> colNamesWithTypesForExternal = colNameWithTypes.stream()
.filter(p -> !HoodieRecord.HOODIE_META_COLUMNS.contains(p.getKey())).collect(Collectors.toList());
LOG.info("colNameWithTypes =" + colNameWithTypes + ", Num Entries =" + colNameWithTypes.size());
if (hoodieColsProjected.isEmpty()) {
return getRecordReaderInternal(eSplit.getBootstrapFileSplit(), job, reporter);
} else if (externalColsProjected.isEmpty()) {
return getRecordReaderInternal(split, job, reporter);
} else {
FileSplit rightSplit = eSplit.getBootstrapFileSplit();
// Hive PPD works at row-group level and only enabled when hive.optimize.index.filter=true;
// The above config is disabled by default. But when enabled, would cause misalignment between
// skeleton and bootstrap file. We will disable them specifically when query needs bootstrap and skeleton
// file to be stitched.
// This disables row-group filtering
JobConf jobConfCopy = new JobConf(job);
jobConfCopy.unset(TableScanDesc.FILTER_EXPR_CONF_STR);
jobConfCopy.unset(ConvertAstToSearchArg.SARG_PUSHDOWN);
LOG.info("Generating column stitching reader for " + eSplit.getPath() + " and " + rightSplit.getPath());
return new BootstrapColumnStichingRecordReader(getRecordReaderInternal(eSplit, jobConfCopy, reporter),
HoodieRecord.HOODIE_META_COLUMNS.size(),
getRecordReaderInternal(rightSplit, jobConfCopy, reporter),
colNamesWithTypesForExternal.size(),
true);
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy