com.twitter.scalding.parquet.cascading.thrift.Parquet346TBaseScheme.scala Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of scalding-parquet-cascading_2.10 Show documentation
scalding-parquet-cascading
The newest version!
package com.twitter.scalding.parquet.cascading.thrift

import com.twitter.scalding.parquet.cascading.ParquetValueScheme

import cascading.flow.FlowProcess
import cascading.tap.Tap
import org.apache.hadoop.mapred.{ JobConf, OutputCollector, RecordReader }
import org.apache.parquet.hadoop.thrift.ThriftReadSupport
import org.apache.parquet.io.ParquetDecodingException
import org.apache.parquet.schema.MessageType
import org.apache.parquet.thrift.struct.ThriftType.StructType.StructOrUnionType
import org.apache.parquet.thrift.struct.ThriftType._
import org.apache.parquet.thrift.struct.{ ThriftField, ThriftType }
import org.apache.parquet.thrift.{ ThriftReader, ThriftRecordConverter }
import org.apache.thrift.TBase
import org.apache.thrift.protocol.TProtocol

import scala.collection.JavaConverters._

/**
 * This file contains workarounds for PARQUET-346; everything in it should
 * be removed once that bug is fixed in upstream parquet.
 *
 * The root issue is that TBaseRecordConverter passes a schema
 * based on the file metadata to ThriftRecordConverter that may be missing
 * structOrUnionType metadata. This metadata is not actually needed, but parquet
 * currently throws if it's missing. The (temporary) "fix" is to populate this metadata
 * by setting all structOrUnionType fields to UNION.
 */

/**
 * A [[ParquetTBaseScheme]] whose source configuration registers
 * [[Parquet346TBaseRecordConverter]] as the record converter, replacing the
 * one set up by the parent scheme.
 *
 * @param config scheme configuration, forwarded unchanged to [[ParquetTBaseScheme]]
 * @tparam T the thrift record type handled by this scheme
 */
class Parquet346TBaseScheme[T <: TBase[_, _]](config: ParquetValueScheme.Config[T])
  extends ParquetTBaseScheme[T](config) {

  override def sourceConfInit(
    fp: FlowProcess[_ <: JobConf],
    tap: Tap[JobConf, RecordReader[_, _], OutputCollector[_, _]],
    jobConf: JobConf): Unit = {
    // The parent initializes the job conf first; the override below must come
    // afterwards so that it replaces the converter class the parent registered.
    super.sourceConfInit(fp, tap, jobConf)

    val repairingConverter = classOf[Parquet346TBaseRecordConverter[_]]
    ThriftReadSupport.setRecordConverterClass(jobConf, repairingConverter)
  }
}

/**
 * Same as TBaseRecordConverter, with one subtle but important difference:
 * the StructType handed to ThriftRecordConverter's constructor is first run
 * through [[Parquet346StructTypeRepairer]]. Older files don't contain all the
 * metadata ThriftSchemaConverter needs in order not to throw; since that
 * metadata is not actually used, dummy values are filled in instead.
 *
 * @param thriftClass            class of the thrift record to materialize
 * @param requestedParquetSchema parquet schema passed through to ThriftRecordConverter
 * @param thriftType             struct description that may be missing structOrUnionType metadata
 */
class Parquet346TBaseRecordConverter[T <: TBase[_, _]](
  thriftClass: Class[T],
  requestedParquetSchema: MessageType,
  thriftType: ThriftType.StructType)
  extends ThriftRecordConverter[T](
    Parquet346TBaseRecordConverter.readerFor(thriftClass),
    thriftClass.getSimpleName,
    requestedParquetSchema,
    // this is the fix -- the missing structOrUnionType metadata is added
    // before the type is passed along
    Parquet346StructTypeRepairer.repair(thriftType))

object Parquet346TBaseRecordConverter {

  /**
   * Builds the reader that materializes one record of `thriftClass` from a protocol.
   *
   * The original note says this reader is the same as the one in
   * ScroogeRecordConverter's constructor.
   * NOTE(review): the class doc refers to TBaseRecordConverter instead -- confirm
   * which upstream class this is meant to mirror.
   *
   * Kept in the companion because it is evaluated as a super-constructor
   * argument, where instance members are not yet available.
   */
  private def readerFor[T <: TBase[_, _]](thriftClass: Class[T]): ThriftReader[T] =
    new ThriftReader[T] {
      override def readOneRecord(protocol: TProtocol): T =
        try {
          val record: T = thriftClass.newInstance
          record.read(protocol)
          record
        } catch {
          case cause: InstantiationException =>
            throw new ParquetDecodingException("Could not instantiate Thrift " + thriftClass, cause)
          case cause: IllegalAccessException =>
            throw new ParquetDecodingException("Thrift class or constructor not public " + thriftClass, cause)
        }
    }
}

/**
 * Visitor that copies a ThriftType tree whose structOrUnionType metadata may
 * be missing. Every StructType in the copy is created with
 * StructOrUnionType.UNION; container types are rebuilt around repaired
 * children, and enum/primitive types are returned as-is.
 */
object Parquet346StructTypeRepairer extends StateVisitor[ThriftType, Unit] {

  /**
   * Entry point: produces the repaired copy of a struct read from file metadata.
   *
   * @param fromMetadata struct type, possibly lacking structOrUnionType metadata
   * @return a new StructType with the metadata populated throughout
   */
  def repair(fromMetadata: StructType): StructType =
    visit(fromMetadata, ())

  /**
   * Copies a field, keeping its name, id and requirement while replacing its
   * type with the result of running this visitor over it.
   */
  def copyRecurse(field: ThriftField): ThriftField = {
    val repairedType: ThriftType = field.getType.accept(this, ())
    new ThriftField(field.getName, field.getFieldId, field.getRequirement, repairedType)
  }

  // --- types with nested fields: rebuilt from recursively copied children ---

  override def visit(structType: StructType, state: Unit): StructType = {
    val repairedChildren = structType.getChildren.asScala.map(copyRecurse)
    // UNION is a placeholder value; it is applied to every struct unconditionally
    new StructType(repairedChildren.asJava, StructOrUnionType.UNION)
  }

  override def visit(mapType: MapType, state: Unit): MapType = {
    val repairedKey = copyRecurse(mapType.getKey)
    val repairedValue = copyRecurse(mapType.getValue)
    new MapType(repairedKey, repairedValue)
  }

  override def visit(setType: SetType, state: Unit): SetType = {
    val repairedElement = copyRecurse(setType.getValues)
    new SetType(repairedElement)
  }

  override def visit(listType: ListType, state: Unit): ListType = {
    val repairedElement = copyRecurse(listType.getValues)
    new ListType(repairedElement)
  }

  // --- types without nested fields: nothing to repair, returned unchanged ---

  override def visit(enumType: EnumType, state: Unit): EnumType = enumType

  override def visit(boolType: BoolType, state: Unit): BoolType = boolType

  override def visit(byteType: ByteType, state: Unit): ByteType = byteType

  override def visit(doubleType: DoubleType, state: Unit): DoubleType = doubleType

  override def visit(i16Type: I16Type, state: Unit): I16Type = i16Type

  override def visit(i32Type: I32Type, state: Unit): I32Type = i32Type

  override def visit(i64Type: I64Type, state: Unit): I64Type = i64Type

  override def visit(stringType: StringType, state: Unit): StringType = stringType
}