
parquet.hadoop.thrift.AbstractThriftWriteSupport

/**
 * Copyright 2012 Twitter, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package parquet.hadoop.thrift;

import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.thrift.TBase;

import com.twitter.elephantbird.pig.util.ThriftToPig;

import parquet.Log;
import parquet.hadoop.BadConfigurationException;
import parquet.hadoop.api.WriteSupport;
import parquet.io.ColumnIOFactory;
import parquet.io.MessageColumnIO;
import parquet.io.ParquetEncodingException;
import parquet.io.api.RecordConsumer;
import parquet.pig.PigMetaData;
import parquet.schema.MessageType;
import parquet.thrift.ParquetWriteProtocol;
import parquet.thrift.ThriftMetaData;
import parquet.thrift.ThriftSchemaConverter;
import parquet.thrift.struct.ThriftType.StructType;


public abstract class AbstractThriftWriteSupport<T> extends WriteSupport<T> {
  public static final String PARQUET_THRIFT_CLASS = "parquet.thrift.class";
  private static final Log LOG = Log.getLog(AbstractThriftWriteSupport.class);

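  /**
   * Stores the thrift class name in the configuration under {@link #PARQUET_THRIFT_CLASS}.
   */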
  public static void setGenericThriftClass(Configuration configuration, Class<?> thriftClass) {
    configuration.set(PARQUET_THRIFT_CLASS, thriftClass.getName());
  }

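  /**
   * Resolves the thrift class named in the configuration under {@link #PARQUET_THRIFT_CLASS}.
   * @throws BadConfigurationException if the setting is missing or the class cannot be loaded
   */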
  public static Class<?> getGenericThriftClass(Configuration configuration) {
    final String thriftClassName = configuration.get(PARQUET_THRIFT_CLASS);
    if (thriftClassName == null) {
      throw new BadConfigurationException("the thrift class conf is missing in job conf at " + PARQUET_THRIFT_CLASS);
    }

    try {
      @SuppressWarnings("unchecked")
      Class<?> thriftClass = Class.forName(thriftClassName);
      return thriftClass;
    } catch (ClassNotFoundException e) {
      throw new BadConfigurationException("the class "+thriftClassName+" in job conf at " + PARQUET_THRIFT_CLASS + " could not be found", e);
    }
  }

  protected Class<?> thriftClass;
  protected MessageType schema;
  protected StructType thriftStruct;
  protected ParquetWriteProtocol parquetWriteProtocol;
  protected WriteContext writeContext;

  /**
   * Used from Hadoop: the configuration must contain a thrift class setting.
   * @see AbstractThriftWriteSupport#setGenericThriftClass(Configuration, Class)
   */
  public AbstractThriftWriteSupport() {
  }

  /**
   * @param thriftClass the thrift class used for writing values
   */
  public AbstractThriftWriteSupport(Class<?> thriftClass) {
    init(thriftClass);
  }

  protected void init(Class<?> thriftClass) {
    this.thriftClass = thriftClass;
    this.thriftStruct = getThriftStruct();

    ThriftSchemaConverter thriftSchemaConverter = new ThriftSchemaConverter();
    this.schema = thriftSchemaConverter.convert(thriftStruct);

    final Map<String, String> extraMetaData = new ThriftMetaData(thriftClass.getName(), thriftStruct).toExtraMetaData();
    // adding the Pig schema as it would have been mapped from thrift
    // TODO: make this work for non-tbase types
    if (isPigLoaded() && TBase.class.isAssignableFrom(thriftClass)) {
      new PigMetaData(new ThriftToPig((Class<? extends TBase<?, ?>>) thriftClass).toSchema()).addToMetaData(extraMetaData);
    }

    this.writeContext = new WriteContext(schema, extraMetaData);
  }

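  /** Uses reflection to check whether Pig's Schema class is on the classpath. */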
  protected boolean isPigLoaded() {
    try {
      Class.forName("org.apache.pig.impl.logicalLayer.schema.Schema");
      return true;
    } catch (ClassNotFoundException e) {
      LOG.info("Pig is not loaded, pig metadata will not be written");
      return false;
    }
  }

  @Override
  public WriteContext init(Configuration configuration) {
    if (writeContext == null) {
      init(getGenericThriftClass(configuration));
    }
    return writeContext;
  }

  @Override
  public void prepareForWrite(RecordConsumer recordConsumer) {
    final MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
    this.parquetWriteProtocol = new ParquetWriteProtocol(recordConsumer, columnIO, thriftStruct);
  }

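  /**
   * Returns the thrift struct descriptor for the configured thrift class;
   * implemented by concrete subclasses.
   */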
  protected abstract StructType getThriftStruct();
}
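
For reference, a minimal concrete subclass might look like the sketch below. It is illustrative only: the name ExampleThriftWriteSupport is not part of parquet-thrift, the sketch assumes a TBase-generated record class (roughly mirroring the TBase-based write support that ships alongside this class), and it assumes ThriftSchemaConverter exposes toStructType(Class). It would live in its own file and additionally import org.apache.thrift.TException.

// Hypothetical subclass for TBase-generated thrift records (sketch, not part of this file).
public class ExampleThriftWriteSupport<T extends TBase<?, ?>> extends AbstractThriftWriteSupport<T> {

  public ExampleThriftWriteSupport() {
    // used from Hadoop: the thrift class is resolved later from the job configuration
  }

  public ExampleThriftWriteSupport(Class<T> thriftClass) {
    super(thriftClass);
  }

  @Override
  @SuppressWarnings("unchecked")
  protected StructType getThriftStruct() {
    // assumption: ThriftSchemaConverter.toStructType(Class) is available, as it is for
    // the TBase write support in parquet-thrift
    return new ThriftSchemaConverter().toStructType((Class<? extends TBase<?, ?>>) thriftClass);
  }

  @Override
  public void write(T record) {
    try {
      // ParquetWriteProtocol is a TProtocol, so the generated write() streams the record
      // straight into the Parquet record consumer
      record.write(parquetWriteProtocol);
    } catch (TException e) {
      throw new ParquetEncodingException(e);
    }
  }
}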


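On the Hadoop side, the only contract is that the job configuration names the record class under parquet.thrift.class, which the static helpers above write and read. A rough sketch of that handshake, reusing the hypothetical ExampleThriftWriteSupport and a placeholder thrift-generated class UserProfile (imports mirror those of this file, plus the write support classes themselves):

// Hypothetical driver-side snippet; UserProfile stands in for a thrift-generated record class.
public class ExampleThriftJobSetup {
  public static void main(String[] args) {
    Configuration conf = new Configuration();

    // Driver side: record the record class under PARQUET_THRIFT_CLASS ("parquet.thrift.class").
    AbstractThriftWriteSupport.setGenericThriftClass(conf, UserProfile.class);

    // Task side: the framework calls init(Configuration); the write support resolves the
    // class again via getGenericThriftClass and derives the Parquet schema. A missing or
    // unknown class name surfaces as BadConfigurationException.
    WriteSupport.WriteContext context = new ExampleThriftWriteSupport<UserProfile>().init(conf);
    System.out.println(context.getSchema());
  }
}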

