package com.twitter.scalding.parquet.cascading.scrooge
import cascading.flow.FlowProcess
import cascading.tap.Tap
import com.twitter.scalding.parquet.cascading.ParquetValueScheme
import com.twitter.scalding.parquet.cascading.thrift.Parquet346StructTypeRepairer
import com.twitter.scrooge.{ ThriftStruct, ThriftStructCodec }
import org.apache.hadoop.mapred.{ JobConf, OutputCollector, RecordReader }
import org.apache.parquet.hadoop.thrift.ThriftReadSupport
import org.apache.parquet.schema.MessageType
import org.apache.parquet.thrift.struct.ThriftType.StructType
import org.apache.parquet.thrift.{ ThriftReader, ThriftRecordConverter }
import org.apache.thrift.protocol.TProtocol
import scala.util.control.NonFatal

/**
 * This file contains workarounds for PARQUET-346; everything in it should
 * be removed once that bug is fixed in upstream parquet.
 *
 * The root issue is that ScroogeRecordConverter passes a schema, derived
 * from the file metadata, to ThriftRecordConverter, and that schema may be
 * missing structOrUnionType metadata. This metadata is not actually needed,
 * but parquet currently throws if it's missing. The (temporary) "fix" is to
 * populate this metadata by setting all structOrUnionType fields to UNION.
 */
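
// A rough, hypothetical sketch of the idea behind the repair (the real logic
// lives in Parquet346StructTypeRepairer, which also recurses into nested
// struct, map, list and set types), assuming parquet's StructType exposes
// getChildren and a (children, structOrUnionType) constructor:
//
//   import org.apache.parquet.thrift.struct.ThriftType.StructType.StructOrUnionType
//
//   def repairTopLevel(st: StructType): StructType =
//     // rebuild the struct with structOrUnionType forced to UNION, so that
//     // parquet's ThriftSchemaConverter no longer throws when it is missing
//     new StructType(st.getChildren, StructOrUnionType.UNION)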
/**
 * The same as ParquetScroogeScheme, but sets the record converter to
 * Parquet346ScroogeRecordConverter.
 */
class Parquet346ScroogeScheme[T <: ThriftStruct](config: ParquetValueScheme.Config[T])
extends ParquetScroogeScheme[T](config) {
override def sourceConfInit(fp: FlowProcess[_ <: JobConf],
tap: Tap[JobConf, RecordReader[_, _], OutputCollector[_, _]],
jobConf: JobConf): Unit = {
super.sourceConfInit(fp, tap, jobConf)
// Use the fixed record converter instead of the one set in super
ThriftReadSupport.setRecordConverterClass(jobConf, classOf[Parquet346ScroogeRecordConverter[_]])
}
}
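
// Hedged usage sketch: MyStruct stands in for a hypothetical Scrooge-generated
// struct, and Config is assumed to follow the parquet-cascading builder style:
//
//   val config = new ParquetValueScheme.Config[MyStruct]().withRecordClass(classOf[MyStruct])
//   val scheme = new Parquet346ScroogeScheme[MyStruct](config)
//   val tap = new cascading.tap.hadoop.Hfs(scheme, "hdfs:///path/to/parquet")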
object Parquet346ScroogeRecordConverter {
/**
* Same as the (private) getCodec in ScroogeRecordConverter
*/
def getCodec[T <: ThriftStruct](klass: Class[T]): ThriftStructCodec[T] = {
try {
val companionClass = Class.forName(klass.getName + "$")
val companionObject: AnyRef = companionClass.getField("MODULE$").get(null)
companionObject.asInstanceOf[ThriftStructCodec[T]]
} catch {
case NonFatal(e) => throw new RuntimeException("Unable to create ThriftStructCodec", e)
}
}
}
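
// For a Scrooge-generated struct, the companion object (e.g. `object MyStruct
// extends ThriftStructCodec3[MyStruct]`) compiles to a class named `MyStruct$`
// whose singleton instance lives in the static `MODULE$` field; that is what
// the reflection in getCodec digs out. Hypothetical usage:
//
//   val codec: ThriftStructCodec[MyStruct] =
//     Parquet346ScroogeRecordConverter.getCodec(classOf[MyStruct])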

/**
 * Same as ScroogeRecordConverter, with one important (subtle) difference:
 * it passes a repaired schema (StructType) to ThriftRecordConverter's
 * constructor. This matters because older files don't contain all of the
 * metadata that ThriftSchemaConverter needs in order not to throw, but we
 * can fill in dummy values because that metadata is never actually used.
 */
class Parquet346ScroogeRecordConverter[T <: ThriftStruct](thriftClass: Class[T],
parquetSchema: MessageType,
thriftType: StructType) extends ThriftRecordConverter[T](
// this is a little confusing because it's all being passed to the super constructor
// this thrift reader is the same as what's in ScroogeRecordConverter's constructor
new ThriftReader[T] {
val codec: ThriftStructCodec[T] = Parquet346ScroogeRecordConverter.getCodec(thriftClass)
def readOneRecord(protocol: TProtocol): T = codec.decode(protocol)
},
thriftClass.getSimpleName,
parquetSchema,
// this is the fix -- we add in the missing structOrUnionType metadata
// before passing it along
Parquet346StructTypeRepairer.repair(thriftType))
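
// Note: ThriftReadSupport instantiates the converter class registered in
// sourceConfInit reflectively, so (as far as we understand the parquet
// internals) this class must keep the (Class[T], MessageType, StructType)
// constructor shape that the stock ScroogeRecordConverter exposes.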