com.spotify.parquet.tensorflow.TensorflowExampleParquetInputFormat Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of scio-parquet_2.12 Show documentation
Scio add-on for Parquet
The newest version!
/*
 * Copyright 2023 Spotify AB
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.spotify.parquet.tensorflow;

import org.apache.hadoop.mapreduce.Job;
import org.apache.parquet.hadoop.ParquetInputFormat;
import org.apache.parquet.hadoop.util.ContextUtil;
import org.tensorflow.metadata.v0.Schema;
import org.tensorflow.proto.example.Example;

/**
 * A {@link ParquetInputFormat} whose record type is the TensorFlow {@link Example} message.
 *
 * <p>The static helpers on this class forward read-time schema settings to {@link
 * TensorflowExampleReadSupport}, together with the Hadoop configuration obtained from the job.
 */
public class TensorflowExampleParquetInputFormat extends ParquetInputFormat<Example> {

  /**
   * Set the subset of columns to read (projection pushdown). Specified as a TensorFlow schema, the
   * requested projection is converted into a Parquet schema for Parquet column projection.
   *
   * <p>This is useful if the full schema is large and you only want to read a few columns, since
   * it saves time by not reading unused columns.
   *
   * <p>If a requested projection is set, then the TensorFlow schema used for reading must be
   * compatible with the projection. Use {@link
   * #setExampleReadSchema(org.apache.hadoop.mapreduce.Job, org.tensorflow.metadata.v0.Schema)} to
   * set a read schema, if needed.
   *
   * @param job a job
   * @param requestedProjection the requested projection schema
   * @see #setExampleReadSchema(org.apache.hadoop.mapreduce.Job, org.tensorflow.metadata.v0.Schema)
   * @see TensorflowExampleParquetOutputFormat#setSchema(org.apache.hadoop.mapreduce.Job,
   *     org.tensorflow.metadata.v0.Schema)
   */
  public static void setRequestedProjection(Job job, Schema requestedProjection) {
    // The setting is applied to the job's configuration, not to an input format instance.
    TensorflowExampleReadSupport.setRequestedProjection(
        ContextUtil.getConfiguration(job), requestedProjection);
  }

  /**
   * Override the TensorFlow schema to use for reading. If not set, the TensorFlow schema used for
   * writing is used.
   *
   * @param job a job
   * @param tfReadSchema the requested schema
   * @see #setRequestedProjection(org.apache.hadoop.mapreduce.Job,
   *     org.tensorflow.metadata.v0.Schema)
   * @see TensorflowExampleParquetOutputFormat#setSchema(org.apache.hadoop.mapreduce.Job,
   *     org.tensorflow.metadata.v0.Schema)
   */
  public static void setExampleReadSchema(Job job, Schema tfReadSchema) {
    // The setting is applied to the job's configuration, not to an input format instance.
    TensorflowExampleReadSupport.setExampleReadSchema(
        ContextUtil.getConfiguration(job), tfReadSchema);
  }
}