
org.apache.hudi.utilities.sources.CsvDFSSource
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.utilities.sources;

import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.table.checkpoint.Checkpoint;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.utilities.schema.SchemaProvider;
import org.apache.hudi.utilities.sources.helpers.DFSPathSelector;

import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrameReader;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.avro.SchemaConverters;
import org.apache.spark.sql.types.StructType;

import java.util.Arrays;
import java.util.List;

import static org.apache.hudi.common.util.ConfigUtils.DELTA_STREAMER_CONFIG_PREFIX;
import static org.apache.hudi.common.util.ConfigUtils.STREAMER_CONFIG_PREFIX;

/**
 * Reads data from CSV files on DFS as the data source.
 * <p>
 * Internally, we use Spark to read CSV files, thus any limitation of Spark CSV also applies here
 * (e.g., limited support for nested schema).
 * <p>
 * You can set the CSV-specific configs in the format of hoodie.streamer.csv.*
 * that are Spark compatible to deal with CSV files in Hudi. The supported options are:
 * <p>
 *      "sep", "encoding", "quote", "escape", "charToEscapeQuoteEscaping", "comment",
 *      "header", "enforceSchema", "inferSchema", "samplingRatio", "ignoreLeadingWhiteSpace",
 *      "ignoreTrailingWhiteSpace", "nullValue", "emptyValue", "nanValue", "positiveInf",
 *      "negativeInf", "dateFormat", "timestampFormat", "maxColumns", "maxCharsPerColumn",
 *      "mode", "columnNameOfCorruptRecord", "multiLine"
 * <p>
 * Detailed information on these CSV options can be found at:
 * https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/DataFrameReader.html#csv-scala.collection.Seq-
 * <p>
 * If the source Avro schema is provided through the {@link org.apache.hudi.utilities.schema.FilebasedSchemaProvider}
 * using the "hoodie.streamer.schemaprovider.source.schema.file" config, the schema is
 * passed to the CSV reader without inferring the schema from the CSV file.
 */
public class CsvDFSSource extends RowSource {

  private static final long serialVersionUID = 1L;
  // CsvSource config prefix
  protected static final String CSV_SRC_CONFIG_PREFIX = STREAMER_CONFIG_PREFIX + "csv.";
  protected static final String OLD_CSV_SRC_CONFIG_PREFIX = DELTA_STREAMER_CONFIG_PREFIX + "csv.";
  // CSV-specific configurations to pass in from Hudi to Spark
  protected static final List<String> CSV_CONFIG_KEYS = Arrays.asList(
      "sep", "encoding", "quote", "escape", "charToEscapeQuoteEscaping", "comment",
      "header", "enforceSchema", "inferSchema", "samplingRatio", "ignoreLeadingWhiteSpace",
      "ignoreTrailingWhiteSpace", "nullValue", "emptyValue", "nanValue", "positiveInf",
      "negativeInf", "dateFormat", "timestampFormat", "maxColumns", "maxCharsPerColumn",
      "mode", "columnNameOfCorruptRecord", "multiLine");

  private final transient DFSPathSelector pathSelector;
  private final StructType sourceSchema;

  public CsvDFSSource(TypedProperties props,
      JavaSparkContext sparkContext,
      SparkSession sparkSession,
      SchemaProvider schemaProvider) {
    super(props, sparkContext, sparkSession, schemaProvider);
    this.pathSelector = DFSPathSelector.createSourceSelector(props, sparkContext.hadoopConfiguration());
    if (schemaProvider != null) {
      sourceSchema = (StructType) SchemaConverters.toSqlType(schemaProvider.getSourceSchema())
          .dataType();
    } else {
      sourceSchema = null;
    }
  }

  @Override
  protected Pair<Option<Dataset<Row>>, Checkpoint> fetchNextBatch(Option<Checkpoint> lastCheckpoint,
      long sourceLimit) {
    Pair<Option<String>, Checkpoint> selPathsWithMaxModificationTime =
        pathSelector.getNextFilePathsAndMaxModificationTime(sparkContext, lastCheckpoint, sourceLimit);
    return Pair.of(fromFiles(
        selPathsWithMaxModificationTime.getLeft()), selPathsWithMaxModificationTime.getRight());
  }

  /**
   * Reads the CSV files and parses the lines into a {@link Dataset} of {@link Row}.
   *
   * @param pathStr The list of file paths, separated by ','.
   * @return {@link Dataset} of {@link Row} containing the records.
   */
  private Option<Dataset<Row>> fromFiles(Option<String> pathStr) {
    if (pathStr.isPresent()) {
      DataFrameReader dataFrameReader = sparkSession.read().format("csv");
      CSV_CONFIG_KEYS.forEach(optionKey -> {
        String configPropName = CSV_SRC_CONFIG_PREFIX + optionKey;
        String oldConfigPropName = OLD_CSV_SRC_CONFIG_PREFIX + optionKey;
        String value = props.getString(configPropName, props.getString(oldConfigPropName, null));
        // Pass down the Hudi CSV configs to Spark DataFrameReader
        if (value != null) {
          dataFrameReader.option(optionKey, value);
        }
      });
      if (sourceSchema != null) {
        // Source schema is specified, pass it to the reader
        dataFrameReader.schema(sourceSchema);
      }
      dataFrameReader.option("inferSchema", Boolean.toString(sourceSchema == null));
      return Option.of(dataFrameReader.load(pathStr.get().split(",")));
    } else {
      return Option.empty();
    }
  }
}
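For illustration, below is a minimal, hypothetical sketch of how the hoodie.streamer.csv.* options described in the Javadoc above might be supplied. The input-path key (hoodie.streamer.source.dfs.root) and the local paths are assumptions for this example; in a real deployment these properties live in the HoodieStreamer properties file and the source is instantiated by HoodieStreamer via --source-class rather than constructed by hand.

import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.utilities.sources.CsvDFSSource;

import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;

public class CsvDFSSourceExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("csv-dfs-source-example")
        .master("local[*]")  // assumption: local run for illustration only
        .getOrCreate();
    JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

    TypedProperties props = new TypedProperties();
    // Root path scanned by the DFSPathSelector (hypothetical location).
    props.setProperty("hoodie.streamer.source.dfs.root", "file:///tmp/csv-input");
    // CSV options under the hoodie.streamer.csv.* prefix are forwarded
    // one-to-one to Spark's DataFrameReader by fromFiles().
    props.setProperty("hoodie.streamer.csv.header", "true");
    props.setProperty("hoodie.streamer.csv.sep", "\t");
    props.setProperty("hoodie.streamer.csv.nullValue", "\\N");

    // No SchemaProvider given, so the source falls back to inferSchema=true.
    CsvDFSSource source = new CsvDFSSource(props, jsc, spark, null);
    // fetchNextBatch(...) is protected; HoodieStreamer drives it with
    // checkpoints internally, so it is not invoked directly here.
  }
}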




