/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.gobblin.writer;

import java.io.IOException;
import java.util.Optional;

import org.apache.gobblin.configuration.State;
import org.apache.gobblin.util.ForkOperatorUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

import com.google.common.base.Preconditions;
import com.google.common.base.Strings;

import parquet.column.ParquetProperties;
import parquet.example.data.Group;
import parquet.hadoop.ParquetWriter;
import parquet.hadoop.example.GroupWriteSupport;
import parquet.hadoop.metadata.CompressionCodecName;
import parquet.schema.MessageType;

import static org.apache.gobblin.configuration.ConfigurationKeys.LOCAL_FS_URI;
import static org.apache.gobblin.configuration.ConfigurationKeys.WRITER_CODEC_TYPE;
import static org.apache.gobblin.configuration.ConfigurationKeys.WRITER_FILE_SYSTEM_URI;
import static org.apache.gobblin.configuration.ConfigurationKeys.WRITER_PREFIX;
import static parquet.hadoop.ParquetWriter.DEFAULT_BLOCK_SIZE;
import static parquet.hadoop.ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED;
import static parquet.hadoop.ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED;
import static parquet.hadoop.ParquetWriter.DEFAULT_PAGE_SIZE;


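/**
 * A {@link FsDataWriterBuilder} for writers that emit Parquet files of {@link Group} records
 * described by a {@link MessageType} schema. Page size, dictionary page size, dictionary
 * encoding, validation, writer version and compression codec are read from the destination
 * {@link State} using the property keys defined below. Only HDFS destinations are supported,
 * for which a {@link ParquetHdfsDataWriter} is built.
 */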
public class ParquetDataWriterBuilder extends FsDataWriterBuilder<MessageType, Group> {
  public static final String WRITER_PARQUET_PAGE_SIZE = WRITER_PREFIX + ".parquet.pageSize";
  public static final String WRITER_PARQUET_DICTIONARY_PAGE_SIZE = WRITER_PREFIX + ".parquet.dictionaryPageSize";
  public static final String WRITER_PARQUET_DICTIONARY = WRITER_PREFIX + ".parquet.dictionary";
  public static final String WRITER_PARQUET_VALIDATE = WRITER_PREFIX + ".parquet.validate";
  public static final String WRITER_PARQUET_VERSION = WRITER_PREFIX + ".parquet.version";
  public static final String DEFAULT_PARQUET_WRITER = "v1";
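
  /*
   * Illustrative configuration sketch (not part of the original source). Assuming Gobblin's
   * WRITER_PREFIX resolves to "writer" and WRITER_CODEC_TYPE to "writer.codec.type", a job
   * could tune this builder with properties along these lines; the values are examples only:
   *
   *   writer.parquet.pageSize=1048576
   *   writer.parquet.dictionaryPageSize=1048576
   *   writer.parquet.dictionary=true
   *   writer.parquet.validate=false
   *   writer.parquet.version=v1
   *   writer.codec.type=SNAPPY
   */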

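  /**
   * Builds the {@link DataWriter} for this destination: a {@link ParquetHdfsDataWriter} for HDFS
   * destinations. The destination, writer id, schema and PARQUET output format must already be
   * set; any other destination type causes a {@link RuntimeException}.
   */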
  @Override
  public DataWriter<Group> build()
      throws IOException {
    Preconditions.checkNotNull(this.destination);
    Preconditions.checkArgument(!Strings.isNullOrEmpty(this.writerId));
    Preconditions.checkNotNull(this.schema);
    Preconditions.checkArgument(this.format == WriterOutputFormat.PARQUET);

    switch (this.destination.getType()) {
      case HDFS:
        return new ParquetHdfsDataWriter(this, this.destination.getProperties());
      default:
        throw new RuntimeException("Unknown destination type: " + this.destination.getType());
    }
  }

  /**
   * Build a {@link ParquetWriter} of {@link Group} records for the given staging file.
   *
   * @param blockSize the Parquet row group (block) size in bytes
   * @param stagingFile the staging file path, resolved against the configured writer file system URI
   * @return a configured {@link ParquetWriter}
   * @throws IOException if the underlying Parquet writer cannot be created
   */
  public ParquetWriter<Group> getWriter(int blockSize, Path stagingFile)
      throws IOException {
    State state = this.destination.getProperties();

    // Page and dictionary-page sizes; note the dictionary page size falls back to the
    // Parquet block-size default when not configured explicitly.
    int pageSize = state.getPropAsInt(getProperty(WRITER_PARQUET_PAGE_SIZE), DEFAULT_PAGE_SIZE);
    int dictPageSize = state.getPropAsInt(getProperty(WRITER_PARQUET_DICTIONARY_PAGE_SIZE), DEFAULT_BLOCK_SIZE);
    boolean enableDictionary =
        state.getPropAsBoolean(getProperty(WRITER_PARQUET_DICTIONARY), DEFAULT_IS_DICTIONARY_ENABLED);
    boolean validate = state.getPropAsBoolean(getProperty(WRITER_PARQUET_VALIDATE), DEFAULT_IS_VALIDATING_ENABLED);

    // Resolve the staging file against the configured writer file system (defaults to the local FS).
    String rootURI = state.getProp(WRITER_FILE_SYSTEM_URI, LOCAL_FS_URI);
    Path absoluteStagingFile = new Path(rootURI, stagingFile);

    // Write Group records via the example GroupWriteSupport, carrying the schema in the Hadoop conf.
    CompressionCodecName codec = getCodecFromConfig();
    GroupWriteSupport support = new GroupWriteSupport();
    Configuration conf = new Configuration();
    GroupWriteSupport.setSchema(this.schema, conf);
    ParquetProperties.WriterVersion writerVersion = getWriterVersion();
    return new ParquetWriter<>(absoluteStagingFile, support, codec, blockSize, pageSize, dictPageSize, enableDictionary,
        validate, writerVersion, conf);
  }
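
  /*
   * Illustrative sketch (not part of the original source): the writer produced by build(),
   * e.g. ParquetHdfsDataWriter, is expected to obtain its underlying ParquetWriter through
   * getWriter(). The staging path and the use of DEFAULT_BLOCK_SIZE below are hypothetical:
   *
   *   ParquetWriter<Group> writer =
   *       builder.getWriter(DEFAULT_BLOCK_SIZE, new Path("/tmp/staging/part-00000.parquet"));
   *   writer.write(group);   // write one parquet.example.data.Group record
   *   writer.close();        // flush row groups and write the Parquet footer
   */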

  /** Parquet writer version (e.g. "v1" or "v2"), defaulting to {@value #DEFAULT_PARQUET_WRITER}. */
  private ParquetProperties.WriterVersion getWriterVersion() {
    return ParquetProperties.WriterVersion.fromString(
        this.destination.getProperties().getProp(getProperty(WRITER_PARQUET_VERSION), DEFAULT_PARQUET_WRITER));
  }

  /** Compression codec from the writer codec property, defaulting to SNAPPY when unset. */
  private CompressionCodecName getCodecFromConfig() {
    State state = this.destination.getProperties();
    String codecValue = Optional.ofNullable(state.getProp(getProperty(WRITER_CODEC_TYPE)))
        .orElse(CompressionCodecName.SNAPPY.toString());
    return CompressionCodecName.valueOf(codecValue.toUpperCase());
  }

  /** Resolves the branch-specific name of a writer property for this builder's fork branch. */
  private String getProperty(String key) {
    return ForkOperatorUtils.getPropertyNameForBranch(key, this.getBranches(), this.getBranch());
  }
}



