wvlet.airframe.parquet.ParquetRecordReader.scala
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package wvlet.airframe.parquet

import org.apache.parquet.io.api.{Binary, Converter, GroupConverter, PrimitiveConverter, RecordMaterializer}
import org.apache.parquet.schema.{GroupType, MessageType}
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName
import org.apache.parquet.schema.LogicalTypeAnnotation.{jsonType, stringType}
import wvlet.airframe.codec.MessageCodec
import wvlet.airframe.codec.PrimitiveCodec.ValueCodec
import wvlet.airframe.surface.Surface
import wvlet.log.LogSupport

import scala.jdk.CollectionConverters.*

object ParquetRecordReader extends LogSupport {
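  // Each primitive converter below receives decoded column values from Parquet
  // and appends them to the shared RecordBuilder under the column's field name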
  private class IntConverter(fieldName: String, holder: RecordBuilder) extends PrimitiveConverter {
    override def addInt(value: Int): Unit = {
      holder.add(fieldName, value)
    }
  }
  private class LongConverter(fieldName: String, holder: RecordBuilder) extends PrimitiveConverter {
    override def addLong(value: Long): Unit = {
      holder.add(fieldName, value)
    }
  }
  private class BooleanConverter(fieldName: String, holder: RecordBuilder) extends PrimitiveConverter {
    override def addBoolean(value: Boolean): Unit = {
      holder.add(fieldName, value)
    }
  }
  private class StringConverter(fieldName: String, holder: RecordBuilder) extends PrimitiveConverter with LogSupport {
    override def addBinary(value: Binary): Unit = {
      holder.add(fieldName, value.toStringUsingUTF8)
    }
  }
  private class FloatConverter(fieldName: String, holder: RecordBuilder) extends PrimitiveConverter {
    override def addFloat(value: Float): Unit = {
      holder.add(fieldName, value)
    }
  }
  private class DoubleConverter(fieldName: String, holder: RecordBuilder) extends PrimitiveConverter {
    override def addDouble(value: Double): Unit = {
      holder.add(fieldName, value)
    }
  }
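  // BINARY columns without a logical type annotation are treated as
  // MessagePack-encoded values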
  private class MsgPackConverter(fieldName: String, holder: RecordBuilder) extends PrimitiveConverter {
    override def addBinary(value: Binary): Unit = {
      val v = ValueCodec.fromMsgPack(value.getBytes)
      holder.add(fieldName, v)
    }
  }
  private class JsonConverter(fieldName: String, holder: RecordBuilder) extends PrimitiveConverter {
    override def addBinary(value: Binary): Unit = {
      val jsonStr = value.toStringUsingUTF8
      val obj: Any =
        if (jsonStr.startsWith("{") || jsonStr.startsWith("[")) {
          // Map to message pack value for handling nested objects
          ValueCodec.fromJson(jsonStr)
        } else {
          jsonStr
        }
      holder.add(fieldName, obj)
    }
  }

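  // Holds a reference to the parent record's builder so that a nested record can
  // register its completed value under the parent's parameter name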
  case class ParentContext(paramName: String, recordBuilder: RecordBuilder)

}

import ParquetRecordReader.*

class ParquetRecordReader[A](
    surface: Surface,
    projectedSchema: GroupType,
    parentContext: Option[ParentContext] = None
) extends GroupConverter
    with LogSupport {
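  // The codec maps the collected key-value pairs into an instance of A;
  // recordBuilder accumulates the field values of the current record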
  private val codec         = MessageCodec.ofSurface(surface)
  private val recordBuilder = RecordBuilder.newBuilder

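  // Create one Converter per column of the projected schema. Primitive columns
  // write directly into recordBuilder; group columns recurse with a child
  // ParquetRecordReader that reports back through ParentContext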
  private val converters: Seq[Converter] = projectedSchema.getFields.asScala.map { f =>
    val cv: Converter = f match {
      case p if p.isPrimitive =>
        p.asPrimitiveType().getPrimitiveTypeName match {
          case PrimitiveTypeName.INT32   => new IntConverter(f.getName, recordBuilder)
          case PrimitiveTypeName.INT64   => new LongConverter(f.getName, recordBuilder)
          case PrimitiveTypeName.BOOLEAN => new BooleanConverter(f.getName, recordBuilder)
          case PrimitiveTypeName.FLOAT   => new FloatConverter(f.getName, recordBuilder)
          case PrimitiveTypeName.DOUBLE  => new DoubleConverter(f.getName, recordBuilder)
          case PrimitiveTypeName.BINARY if p.getLogicalTypeAnnotation == stringType() =>
            new StringConverter(f.getName, recordBuilder)
          case PrimitiveTypeName.BINARY if p.getLogicalTypeAnnotation == jsonType() =>
            new JsonConverter(f.getName, recordBuilder)
          case PrimitiveTypeName.BINARY =>
            new MsgPackConverter(f.getName, recordBuilder)
          case other =>
            // INT96, FIXED_LEN_BYTE_ARRAY, etc. are not supported yet
            throw new UnsupportedOperationException(s"Unsupported Parquet primitive type: ${other}")
        }
      case _ if surface.isMap =>
        // Mapping Parquet columns to non-object types (e.g., Map[String, Any])
        if (f.isPrimitive) {
          new MsgPackConverter(f.getName, recordBuilder)
        } else {
          // Mapping Parquet group types to non-object types
          new ParquetRecordReader(
            Surface.of[Map[String, Any]],
            f.asGroupType(),
            parentContext = Some(ParentContext(f.getName, recordBuilder))
          )
        }
      case _ =>
        // GroupConverter for nested objects
        surface.params.find(_.name == f.getName) match {
          case Some(param) =>
            if (param.surface.isOption || param.surface.isSeq || param.surface.isArray) {
              // For Option[X], Seq[X] types, extract X
              val elementSurface = param.surface.typeArgs(0)
              new ParquetRecordReader(
                param.surface,
                ParquetSchema.toParquetSchema(elementSurface),
                parentContext = Some(ParentContext(f.getName, recordBuilder))
              )
            } else {
              new ParquetRecordReader(
                param.surface,
                ParquetSchema.toParquetSchema(param.surface),
                parentContext = Some(ParentContext(f.getName, recordBuilder))
              )
            }
          case None =>
            // The Parquet field has no matching parameter in the target object type
            throw new IllegalArgumentException(s"Parameter ${f.getName} is not found in ${surface}")
        }
    }
    cv
  }.toIndexedSeq

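  // Materialize the values collected for the current record into an instance of A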
  def currentRecord: A = {
    val m = recordBuilder.toMap
    codec.fromMap(m).asInstanceOf[A]
  }

  override def getConverter(fieldIndex: Int): Converter = converters(fieldIndex)

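  // Called by Parquet at the start of each record; discard previously collected values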
  override def start(): Unit = {
    recordBuilder.clear()
  }

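  // Called by Parquet at the end of each record. For nested records, emit the
  // collected map into the parent record's builder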
  override def end(): Unit = {
    parentContext.foreach { ctx =>
      val m = recordBuilder.toMap
      ctx.recordBuilder.add(ctx.paramName, m)
    }
  }
}

/**
  * An adapter class for org.apache.parquet.io.api.RecordMaterializer
  */
class ParquetRecordMaterializer[A](surface: Surface, projectedSchema: MessageType) extends RecordMaterializer[A] {
  private val recordConverter = new ParquetRecordReader[A](surface, projectedSchema)

  override def getCurrentRecord: A = recordConverter.currentRecord

  override def getRootConverter: GroupConverter = recordConverter
}
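
For reference, here is a minimal sketch of how ParquetRecordMaterializer could be plugged into Parquet's read path through a ReadSupport. The Person case class and PersonReadSupport below are hypothetical illustrations, not part of airframe-parquet:

import org.apache.hadoop.conf.Configuration
import org.apache.parquet.hadoop.api.{InitContext, ReadSupport}
import org.apache.parquet.hadoop.api.ReadSupport.ReadContext
import org.apache.parquet.io.api.RecordMaterializer
import org.apache.parquet.schema.MessageType
import wvlet.airframe.parquet.ParquetRecordMaterializer
import wvlet.airframe.surface.Surface

// Hypothetical record type used only for this example
case class Person(id: Int, name: String)

class PersonReadSupport extends ReadSupport[Person] {
  // Request all columns of the file schema (no column projection)
  override def init(context: InitContext): ReadContext =
    new ReadContext(context.getFileSchema)

  // Hand Parquet a materializer that assembles Person records via ParquetRecordReader
  override def prepareForRead(
      configuration: Configuration,
      keyValueMetaData: java.util.Map[String, String],
      fileSchema: MessageType,
      readContext: ReadContext
  ): RecordMaterializer[Person] =
    new ParquetRecordMaterializer[Person](Surface.of[Person], readContext.getRequestedSchema)
}

A ParquetReader built with this ReadSupport would yield Person instances, with each record assembled from column values by ParquetRecordReader and decoded through MessageCodec.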
