// Source listing: com.spotify.parquet.tensorflow.TensorflowExampleParquetInputFormat (Maven / Gradle / Ivy)
/*
* Copyright 2023 Spotify AB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.spotify.parquet.tensorflow;
import org.apache.hadoop.mapreduce.Job;
import org.apache.parquet.hadoop.ParquetInputFormat;
import org.apache.parquet.hadoop.util.ContextUtil;
import org.tensorflow.metadata.v0.Schema;
import org.tensorflow.proto.example.Example;
/**
 * A {@link ParquetInputFormat} whose records are TensorFlow {@link Example} messages.
 *
 * <p>The static helpers on this class configure a Hadoop {@link Job} for reading: they resolve
 * the job's configuration through {@link ContextUtil#getConfiguration} and hand it, together with
 * the supplied TensorFlow schema, to {@code TensorflowExampleReadSupport}.
 */
public class TensorflowExampleParquetInputFormat extends ParquetInputFormat<Example> {
  // NOTE(review): this class declares no constructor, so the inherited no-arg construction path
  // is used and no read-support class is registered here. Confirm that the read support is
  // configured on the job elsewhere.

  /**
   * Set the subset of columns to read (projection pushdown). Specified as a TensorFlow schema,
   * the requested projection is converted into a Parquet schema for Parquet column projection.
   *
   * <p>This is useful if the full schema is large and you only want to read a few columns, since
   * it saves time by not reading unused columns.
   *
   * <p>If a requested projection is set, then the TensorFlow schema used for reading must be
   * compatible with the projection. Use {@link
   * #setExampleReadSchema(org.apache.hadoop.mapreduce.Job, org.tensorflow.metadata.v0.Schema)} to
   * set a read schema, if needed.
   *
   * @param job the job whose configuration receives the projection
   * @param requestedProjection the requested projection schema
   * @see #setExampleReadSchema(org.apache.hadoop.mapreduce.Job, org.tensorflow.metadata.v0.Schema)
   * @see TensorflowExampleParquetOutputFormat#setSchema(org.apache.hadoop.mapreduce.Job,
   *     org.tensorflow.metadata.v0.Schema)
   */
  public static void setRequestedProjection(Job job, Schema requestedProjection) {
    TensorflowExampleReadSupport.setRequestedProjection(
        ContextUtil.getConfiguration(job), requestedProjection);
  }

  /**
   * Override the TensorFlow schema to use for reading. If not set, the TensorFlow schema used for
   * writing is used.
   *
   * @param job the job whose configuration receives the read schema
   * @param tfReadSchema the schema to read with
   * @see #setRequestedProjection(org.apache.hadoop.mapreduce.Job,
   *     org.tensorflow.metadata.v0.Schema)
   * @see TensorflowExampleParquetOutputFormat#setSchema(org.apache.hadoop.mapreduce.Job,
   *     org.tensorflow.metadata.v0.Schema)
   */
  public static void setExampleReadSchema(Job job, Schema tfReadSchema) {
    TensorflowExampleReadSupport.setExampleReadSchema(
        ContextUtil.getConfiguration(job), tfReadSchema);
  }
}