// Source file: org.apache.paimon.spark.PaimonStatistics.scala
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.paimon.spark
import org.apache.paimon.spark.data.SparkInternalRow
import org.apache.paimon.stats.ColStats
import org.apache.paimon.types.{DataField, DataType, RowType}
import org.apache.spark.sql.PaimonUtils
import org.apache.spark.sql.catalyst.plans.logical.ColumnStat
import org.apache.spark.sql.connector.expressions.NamedReference
import org.apache.spark.sql.connector.read.Statistics
import org.apache.spark.sql.connector.read.colstats.ColumnStatistics
import java.util.{Optional, OptionalLong}
import scala.collection.JavaConverters._
/**
 * Spark [[Statistics]] implementation reported for a Paimon scan.
 *
 * When the scan carries Paimon statistics that contain both a merged record size and a merged
 * record count, the size, row count and column statistics are derived from them. Otherwise the
 * row count is summed over the scan's input partitions and the size is that row count multiplied
 * by the default size of the read schema.
 *
 * @param scan
 *   the scan whose statistics are reported
 * @tparam T
 *   concrete scan type
 */
case class PaimonStatistics[T <: PaimonBaseScan](scan: T) extends Statistics {

  /** Row count summed over the scan's input partitions. */
  private lazy val rowCount: Long = scan.lazyInputPartitions.map(_.rowCount()).sum

  /** Fallback size estimate: partition row count times the read schema's default size. */
  private lazy val scannedTotalSize: Long = rowCount * scan.readSchema().defaultSize

  /**
   * Statistics attached to the scan, or `null` when none are present. Only dereference after
   * checking [[paimonStatsEnabled]].
   */
  private lazy val paimonStats = if (scan.statistics.isPresent) scan.statistics.get() else null

  /** True when statistics exist and carry both a merged record size and a merged record count. */
  lazy val paimonStatsEnabled: Boolean = {
    paimonStats != null &&
    paimonStats.mergedRecordSize().isPresent &&
    paimonStats.mergedRecordCount().isPresent
  }

  /**
   * Size estimate for one field: the recorded average length when the column has statistics with
   * an average length, otherwise the default size of the field's type.
   */
  private def getSizeForField(field: DataField): Long = {
    Option(paimonStats.colStats().get(field.name()))
      .map(_.avgLen())
      .filter(_.isPresent)
      .map(_.getAsLong)
      .getOrElse(field.`type`().defaultSize().toLong)
  }

  /** Sum of [[getSizeForField]] over the given fields. */
  private def sumFieldSizes(fields: Iterable[DataField]): Long =
    fields.iterator.map(getSizeForField).sum

  /** Sum of [[getSizeForField]] over all fields of the given row type. */
  private def getSizeForRow(schema: RowType): Long =
    sumFieldSizes(schema.getFields.asScala)

  /**
   * Estimated size in bytes of the data produced by the scan.
   *
   * With Paimon statistics enabled, the merged record size is scaled by the ratio between the
   * estimated size of the read columns and the estimated size of the whole table row, and the
   * estimated size of the metadata columns (per record) is added on top.
   */
  override def sizeInBytes(): OptionalLong = {
    if (!paimonStatsEnabled) {
      OptionalLong.of(scannedTotalSize)
    } else {
      val recordCount = paimonStats.mergedRecordCount().getAsLong

      val wholeSchemaSize = getSizeForRow(scan.tableRowType)
      val requiredDataSchemaSize = sumFieldSizes(scan.readTableRowType.getFields.asScala)

      // Guard the division: with a zero whole-row estimate the ratio would be NaN, which would
      // propagate through the addition below and turn the whole estimate (including the
      // metadata columns) into 0 when converted to Long.
      val requiredFraction =
        if (wholeSchemaSize > 0) requiredDataSchemaSize.toDouble / wholeSchemaSize else 0.0
      val requiredDataSizeInBytes = paimonStats.mergedRecordSize().getAsLong * requiredFraction

      val metadataSchemaSize = sumFieldSizes(scan.metadataColumns.map(_.toPaimonDataField))
      val metadataSizeInBytes = recordCount * metadataSchemaSize

      val sizeInBytes = (requiredDataSizeInBytes + metadataSizeInBytes).toLong

      // Avoid returning 0 bytes when there are valid rows, and avoid returning a size smaller
      // than the row count: data on disk is usually compressed more than it is in memory.
      OptionalLong.of(Math.max(sizeInBytes, recordCount))
    }
  }

  /** Merged record count when Paimon statistics are enabled, otherwise the partition row count. */
  override def numRows(): OptionalLong =
    if (paimonStatsEnabled) paimonStats.mergedRecordCount() else OptionalLong.of(rowCount)

  /**
   * Column statistics for every table field that is both named in the scan's required stats
   * schema and present in the Paimon column statistics. Empty when statistics are not enabled.
   */
  override def columnStats(): java.util.Map[NamedReference, ColumnStatistics] = {
    val resultMap = new java.util.HashMap[NamedReference, ColumnStatistics]()
    if (paimonStatsEnabled) {
      // Set lookup instead of a linear scan of the field-name array for every table field.
      val requiredFields = scan.requiredStatsSchema.fieldNames.toSet
      val paimonColStats = paimonStats.colStats()
      scan.tableRowType.getFields.asScala.foreach {
        field =>
          val name = field.name()
          if (requiredFields.contains(name) && paimonColStats.containsKey(name)) {
            resultMap.put(
              PaimonUtils.fieldReference(name),
              PaimonColumnStats(field.`type`(), paimonColStats.get(name))
            )
          }
      }
    }
    resultMap
  }
}
/**
 * Immutable [[ColumnStatistics]] value that overrides the six statistics accessors with
 * constructor-supplied values.
 *
 * @param nullCount
 *   number of null values, if known
 * @param min
 *   minimum value, if known
 * @param max
 *   maximum value, if known
 * @param distinctCount
 *   number of distinct values, if known
 * @param avgLen
 *   average value length, if known
 * @param maxLen
 *   maximum value length, if known
 */
case class PaimonColumnStats(
override val nullCount: OptionalLong,
override val min: Optional[Object],
override val max: Optional[Object],
override val distinctCount: OptionalLong,
override val avgLen: OptionalLong,
override val maxLen: OptionalLong)
extends ColumnStatistics
/** Factory methods building [[PaimonColumnStats]] from Paimon or Spark v1 column statistics. */
object PaimonColumnStats {

  /** Maps a Scala `Option` onto a Java `OptionalLong`, using `toLong` to unwrap the value. */
  private def toOptionalLong[A](value: Option[A])(toLong: A => Long): OptionalLong =
    value.fold(OptionalLong.empty())(v => OptionalLong.of(toLong(v)))

  /**
   * Builds column statistics from Paimon column statistics.
   *
   * The min and max values are passed through `DataConverter.fromPaimon` together with the given
   * type; an absent bound is handed to the converter as `null`, and a `null` conversion result
   * becomes an empty `Optional`. The remaining counters are copied as they are.
   *
   * @param dateType
   *   Paimon data type of the column, used to convert min and max
   * @param paimonColStats
   *   Paimon statistics of the column
   */
  def apply(dateType: DataType, paimonColStats: ColStats[_]): PaimonColumnStats =
    PaimonColumnStats(
      nullCount = paimonColStats.nullCount,
      min = Optional.ofNullable(
        DataConverter.fromPaimon(paimonColStats.min().orElse(null), dateType)),
      max = Optional.ofNullable(
        DataConverter.fromPaimon(paimonColStats.max().orElse(null), dateType)),
      distinctCount = paimonColStats.distinctCount,
      avgLen = paimonColStats.avgLen,
      maxLen = paimonColStats.maxLen
    )

  /**
   * Builds column statistics from a Spark v1 [[ColumnStat]].
   *
   * Each optional counter becomes an `OptionalLong`; min and max are passed as they are and
   * rely on the conversions imported from `PaimonImplicits`.
   *
   * @param v1ColStats
   *   Spark v1 column statistics
   */
  def apply(v1ColStats: ColumnStat): PaimonColumnStats = {
    import PaimonImplicits._
    PaimonColumnStats(
      nullCount = toOptionalLong(v1ColStats.nullCount)(_.longValue),
      min = v1ColStats.min,
      max = v1ColStats.max,
      distinctCount = toOptionalLong(v1ColStats.distinctCount)(_.longValue),
      avgLen = toOptionalLong(v1ColStats.avgLen)(_.longValue()),
      maxLen = toOptionalLong(v1ColStats.maxLen)(_.longValue())
    )
  }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy