com.nvidia.spark.rapids.GpuGetJsonObject.scala Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of rapids-4-spark_2.13 Show documentation
Show all versions of rapids-4-spark_2.13 Show documentation
Creates the distribution package of the RAPIDS plugin for Apache Spark
/*
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.nvidia.spark.rapids
import scala.collection.mutable
import scala.util.parsing.combinator.RegexParsers
import ai.rapids.cudf.{ColumnVector, GetJsonObjectOptions, Scalar}
import com.nvidia.spark.rapids.Arm.{closeOnExcept, withResource}
import com.nvidia.spark.rapids.RapidsPluginImplicits._
import com.nvidia.spark.rapids.jni.JSONUtils
import com.nvidia.spark.rapids.shims.ShimExpression
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.expressions.{Alias, ExpectsInputTypes, Expression, GetJsonObject}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.rapids.GpuGetStructField
import org.apache.spark.sql.rapids.catalyst.expressions.{GpuCombinable, GpuExpressionCombiner, GpuExpressionEquals}
import org.apache.spark.sql.rapids.test.CpuGetJsonObject
import org.apache.spark.sql.types.{DataType, StringType, StructField, StructType}
import org.apache.spark.sql.vectorized.ColumnarBatch
import org.apache.spark.unsafe.types.UTF8String
import org.apache.spark.util.SerializableConfiguration
// Copied from Apache Spark org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
sealed trait PathInstruction
object PathInstruction {
case object Subscript extends PathInstruction
case object Wildcard extends PathInstruction
case object Key extends PathInstruction
case class Index(index: Long) extends PathInstruction
case class Named(name: String) extends PathInstruction
}
object JsonPathParser extends RegexParsers {
import PathInstruction._
def root: Parser[Char] = '$'
def long: Parser[Long] = "\\d+".r ^? {
case x => x.toLong
}
// parse `[*]` and `[123]` subscripts
def subscript: Parser[List[PathInstruction]] =
for {
operand <- '[' ~> ('*' ^^^ Wildcard | long ^^ Index) <~ ']'
} yield {
Subscript :: operand :: Nil
}
// parse `.name` or `['name']` child expressions
def named: Parser[List[PathInstruction]] =
for {
name <- '.' ~> "[^\\.\\[]+".r | "['" ~> "[^\\'\\?]+".r <~ "']"
} yield {
Key :: Named(name) :: Nil
}
// child wildcards: `..`, `.*` or `['*']`
def wildcard: Parser[List[PathInstruction]] =
(".*" | "['*']") ^^^ List(Wildcard)
def node: Parser[List[PathInstruction]] =
wildcard |
named |
subscript
val expression: Parser[List[PathInstruction]] = {
phrase(root ~> rep(node) ^^ (x => x.flatten))
}
def parse(str: String): Option[List[PathInstruction]] = {
this.parseAll(expression, str) match {
case Success(result, _) =>
Some(result)
case _ =>
None
}
}
def filterInstructionsForJni(instructions: List[PathInstruction]): List[PathInstruction] =
instructions.filter {
// The JNI implementation does not need/use these types.
case Subscript => false
case Key => false
case _ => true
}
def fallbackCheck(instructions: List[PathInstruction]): Boolean =
instructions.length > JSONUtils.MAX_PATH_DEPTH
def unzipInstruction(instruction: PathInstruction): (String, String, Long) = {
instruction match {
case Subscript => ("subscript", "", -1)
case Key => ("key", "", -1)
case Wildcard => ("wildcard", "", -1)
case Index(index) => ("index", "", index)
case Named(name) => ("named", name, -1)
}
}
def convertToJniObject(instructions: List[PathInstruction]):
Array[JSONUtils.PathInstructionJni] = {
instructions.map { instruction =>
val (tpe, name, index) = unzipInstruction(instruction)
new JSONUtils.PathInstructionJni(tpe match {
case "wildcard" => JSONUtils.PathInstructionType.WILDCARD
case "index" => JSONUtils.PathInstructionType.INDEX
case "named" => JSONUtils.PathInstructionType.NAMED
case other =>
throw new IllegalArgumentException(s"Internal Error should not see a $other instruction")
}, name, index)
}.toArray
}
def containsUnsupportedPath(instructions: List[PathInstruction]): Boolean = {
// Gpu GetJsonObject is not supported if JSON path contains wildcard [*]
// see https://github.com/NVIDIA/spark-rapids/issues/10216
instructions.exists {
case Wildcard => true
case Named("*") => true
case _ => false
}
}
def normalize(instructions: List[PathInstruction]): String = {
// convert List[PathInstruction] to String
"$" + instructions.map {
case Subscript | Key => ""
case Wildcard => "[*]"
case Index(index) => s"[$index]"
case Named(name) => s"['$name']"
case _ => throw new IllegalArgumentException(s"Invalid instruction in path")
}.mkString
}
}
class GpuGetJsonObjectMeta(
expr: GetJsonObject,
conf: RapidsConf,
parent: Option[RapidsMeta[_, _, _]],
rule: DataFromReplacementRule
) extends BinaryExprMeta[GetJsonObject](expr, conf, parent, rule) {
override def tagExprForGpu(): Unit = {
val lit = GpuOverrides.extractLit(expr.right)
lit.foreach { l =>
val instructions = JsonPathParser.parse(l.value.asInstanceOf[UTF8String].toString)
if (!conf.isLegacyGetJsonObjectEnabled) {
val updated = instructions.map(JsonPathParser.filterInstructionsForJni)
if (updated.exists(JsonPathParser.fallbackCheck)) {
willNotWorkOnGpu(s"get_json_object on GPU does not support more " +
s"than ${JSONUtils.MAX_PATH_DEPTH} nested paths." +
instructions.map(i => s" (Found ${i.length})").getOrElse(""))
}
} else {
if (instructions.exists(JsonPathParser.containsUnsupportedPath)) {
willNotWorkOnGpu("get_json_object on GPU does not support wildcard [*] in path")
}
}
}
}
override def convertToGpu(lhs: Expression, rhs: Expression): GpuExpression = {
if (!conf.isLegacyGetJsonObjectEnabled) {
GpuGetJsonObject(lhs, rhs)(
conf.testGetJsonObjectSavePath, conf.testGetJsonObjectSaveRows)
} else {
GpuGetJsonObjectLegacy(lhs, rhs)(
conf.testGetJsonObjectSavePath, conf.testGetJsonObjectSaveRows)
}
}
}
case class GpuMultiGetJsonObject(json: Expression,
paths: Seq[Option[List[PathInstruction]]],
output: StructType)(targetBatchSize: Long,
parallel: Option[Int])
extends GpuExpression with ShimExpression {
override def otherCopyArgs: Seq[AnyRef] =
targetBatchSize.asInstanceOf[java.lang.Long] ::
parallel ::
Nil
override def dataType: DataType = output
override def nullable: Boolean = false
override def prettyName: String = "multi_get_json_object"
lazy private val jniInstructions = paths.map { p =>
p.map(JsonPathParser.convertToJniObject)
}
override def columnarEval(batch: ColumnarBatch): GpuColumnVector = {
val nullIndexes = jniInstructions.zipWithIndex.filter {
case (None, _) => true
case _ => false
}.map(_._2)
val validPathsWithIndexes = jniInstructions.zipWithIndex.flatMap {
case (Some(arr), idx) => Some((java.util.Arrays.asList(arr: _*), idx))
case _ => None
}
val validPaths = validPathsWithIndexes.map(_._1)
withResource(new Array[ColumnVector](validPaths.length)) { validPathColumns =>
var validPathsIndex = 0
withResource(json.columnarEval(batch)) { input =>
// The get_json_object implementation will allocate an output that is as large
// as the input for each path being processed. This can cause memory to grow
// by a lot. We want to avoid this, but still try to run as many in parallel
// as we can
val p = parallel.getOrElse {
val inputSize = input.getBase.getDeviceMemorySize
// Our memory budget is 4x the target batch size. This is technically going
// to go over that, but in practice it is okay with the default settings
Math.max(Math.ceil((targetBatchSize * 4.0) / inputSize).toInt, 1)
}
validPaths.grouped(p).foreach { validPathChunk =>
withResource(JSONUtils.getJsonObjectMultiplePaths(input.getBase,
java.util.Arrays.asList(validPathChunk: _*))) { chunkedResult =>
chunkedResult.foreach { cr =>
validPathColumns(validPathsIndex) = cr.incRefCount()
validPathsIndex += 1
}
}
}
withResource(new Array[ColumnVector](paths.length)) { columns =>
if (nullIndexes.nonEmpty) {
val nullCol = withResource(GpuScalar.from(null, StringType)) { s =>
ColumnVector.fromScalar(s, batch.numRows())
}
withResource(nullCol) { _ =>
nullIndexes.foreach { idx =>
columns(idx) = nullCol.incRefCount()
}
}
}
validPathsWithIndexes.map(_._2).zipWithIndex.foreach {
case (toIndex, fromIndex) =>
columns(toIndex) = validPathColumns(fromIndex).incRefCount()
}
GpuColumnVector.from(ColumnVector.makeStruct(batch.numRows(), columns: _*), dataType)
}
}
}
}
override def children: Seq[Expression] = Seq(json)
}
class GetJsonObjectCombiner(private val exp: GpuGetJsonObject) extends GpuExpressionCombiner {
private var outputLocation = 0
/**
* A mapping between an expression and where in the output struct of
* the MultiGetJsonObject will the output be.
*/
private val toCombine = mutable.HashMap.empty[GpuExpressionEquals, Int]
addExpression(exp)
override def toString: String = s"GetJsonObjCombiner $toCombine"
override def hashCode: Int = {
// We already know that we are GetJsonObject, and what we can combine is based
// on the json column being the same.
"GetJsonObject".hashCode + (exp.json.semanticHash() * 17)
}
override def equals(o: Any): Boolean = o match {
case other: GetJsonObjectCombiner =>
exp.json.semanticEquals(other.exp.json) &&
// We don't support multi-get with the save path for verify yet
exp.savePathForVerify.isEmpty &&
other.exp.savePathForVerify.isEmpty
case _ => false
}
override def addExpression(e: Expression): Unit = {
val localOutputLocation = outputLocation
outputLocation += 1
val key = GpuExpressionEquals(e)
if (!toCombine.contains(key)) {
toCombine.put(key, localOutputLocation)
}
}
override def useCount: Int = toCombine.size
private def fieldName(id: Int): String =
s"_mgjo_$id"
@scala.annotation.tailrec
private def extractLit(exp: Expression): Option[GpuLiteral] = exp match {
case l: GpuLiteral => Some(l)
case a: Alias => extractLit(a.child)
case _ => None
}
private lazy val multiGet: GpuMultiGetJsonObject = {
val json = toCombine.head._1.e.asInstanceOf[GpuGetJsonObject].json
val fieldsNPaths = toCombine.toSeq.map {
case (k, id) =>
(id, k.e)
}.sortBy(_._1).map {
case (id, e: GpuGetJsonObject) =>
val parsedPath = extractLit(e.path).flatMap { s =>
import GpuGetJsonObject._
val str = s.value match {
case u: UTF8String => u.toString
case _ => null.asInstanceOf[String]
}
val pathInstructions = parseJsonPath(str)
if (hasSeparateWildcard(pathInstructions)) {
// If has separate wildcard path, should return all nulls
None
} else {
// Filter out the unneeded instructions before we cache it
pathInstructions.map(JsonPathParser.filterInstructionsForJni)
}
}
(StructField(fieldName(id), e.dataType, e.nullable), parsedPath)
}
val dt = StructType(fieldsNPaths.map(_._1))
val conf = SQLConf.get
val targetBatchSize = RapidsConf.GPU_BATCH_SIZE_BYTES.get(conf)
val tmp = conf.getConfString("spark.sql.test.multiget.parallel", null)
val parallel = Option(tmp).map(_.toInt)
GpuMultiGetJsonObject(json, fieldsNPaths.map(_._2), dt)(targetBatchSize, parallel)
}
override def getReplacementExpression(e: Expression): Expression = {
val localId = toCombine(GpuExpressionEquals(e))
GpuGetStructField(multiGet, localId, Some(fieldName(localId)))
}
}
object GpuGetJsonObject {
def parseJsonPath(path: GpuScalar): Option[List[PathInstruction]] = {
if (path.isValid) {
val pathStr = path.getValue.toString
JsonPathParser.parse(pathStr)
} else {
None
}
}
def parseJsonPath(pathStr: String): Option[List[PathInstruction]] = {
if (pathStr != null) {
JsonPathParser.parse(pathStr)
} else {
None
}
}
/**
* get_json_object(any_json, '$.*') always return null.
* '$.*' will be parsed to be a single `Wildcard`.
* Check whether has separated `Wildcard`
*
* @param instructions query path instructions
* @return true if has separated `Wildcard`, false otherwise.
*/
def hasSeparateWildcard(instructions: Option[List[PathInstruction]]): Boolean = {
import PathInstruction._
def hasSeparate(ins: List[PathInstruction], idx: Int): Boolean = {
if (idx == 0) {
ins(0) match {
case Wildcard => true
case _ => false
}
} else {
(ins(idx - 1), ins(idx)) match {
case (Key, Wildcard) => false
case (Subscript, Wildcard) => false
case (_, Wildcard) => true
case _ => false
}
}
}
if (instructions.isEmpty) {
false
} else {
val list = instructions.get
list.indices.exists { idx => hasSeparate(list, idx) }
}
}
}
case class GpuGetJsonObject(
json: Expression,
path: Expression)(
val savePathForVerify: Option[String],
val saveRowsForVerify: Int)
extends GpuBinaryExpressionArgsAnyScalar
with ExpectsInputTypes
with GpuCombinable {
import GpuGetJsonObject._
// Get a Hadoop conf for the JSON Object
val hconf: Option[SerializableConfiguration] = savePathForVerify.map { _ =>
val spark = SparkSession.active
new SerializableConfiguration(spark.sparkContext.hadoopConfiguration)
}
val seed = System.nanoTime()
override def otherCopyArgs: Seq[AnyRef] = Seq(savePathForVerify,
saveRowsForVerify.asInstanceOf[java.lang.Integer])
override def left: Expression = json
override def right: Expression = path
override def dataType: DataType = StringType
override def inputTypes: Seq[DataType] = Seq(StringType, StringType)
override def nullable: Boolean = true
override def prettyName: String = "get_json_object"
private var cachedInstructions:
Option[Option[List[PathInstruction]]] = None
override def doColumnar(lhs: GpuColumnVector, rhs: GpuScalar): ColumnVector = {
val fromGpu = cachedInstructions.getOrElse {
val pathInstructions = parseJsonPath(rhs)
val checkedPathInstructions = if (hasSeparateWildcard(pathInstructions)) {
// If has separate wildcard path, should return all nulls
None
} else {
// Filter out the unneeded instructions before we cache it
pathInstructions.map(JsonPathParser.filterInstructionsForJni)
}
cachedInstructions = Some(checkedPathInstructions)
checkedPathInstructions
} match {
case Some(instructions) => {
val jniInstructions = JsonPathParser.convertToJniObject(instructions)
JSONUtils.getJsonObject(lhs.getBase, jniInstructions)
}
case None => GpuColumnVector.columnVectorFromNull(lhs.getRowCount.toInt, StringType)
}
// Below is only for testing purpose
savePathForVerify.foreach { debugPath =>
closeOnExcept(fromGpu) { _ =>
val path = rhs.getValue.asInstanceOf[UTF8String]
withResource(CpuGetJsonObject.getJsonObjectOnCpu(lhs, path)) { fromCpu =>
// verify result, save diffs if have
CpuGetJsonObject.verify(isLegacy = false, seed,
lhs.getBase, path, fromGpu, fromCpu, debugPath, saveRowsForVerify,
hconf.get.value)
}
}
}
fromGpu
}
override def doColumnar(numRows: Int, lhs: GpuScalar, rhs: GpuScalar): ColumnVector = {
withResource(GpuColumnVector.from(lhs, numRows, left.dataType)) { expandedLhs =>
doColumnar(expandedLhs, rhs)
}
}
@transient
private lazy val combiner = new GetJsonObjectCombiner(this)
override def getCombiner(): GpuExpressionCombiner = combiner
}
case class GpuGetJsonObjectLegacy(
json: Expression,
path: Expression)(
savePathForVerify: Option[String],
saveRowsForVerify: Int)
extends GpuBinaryExpressionArgsAnyScalar
with ExpectsInputTypes {
// Get a Hadoop conf for the JSON Object
val hconf: Option[SerializableConfiguration] = savePathForVerify.map { _ =>
val spark = SparkSession.active
new SerializableConfiguration(spark.sparkContext.hadoopConfiguration)
}
val seed = System.nanoTime()
override def otherCopyArgs: Seq[AnyRef] = Seq(savePathForVerify,
saveRowsForVerify.asInstanceOf[java.lang.Integer])
override def left: Expression = json
override def right: Expression = path
override def dataType: DataType = StringType
override def inputTypes: Seq[DataType] = Seq(StringType, StringType)
override def nullable: Boolean = true
override def prettyName: String = "get_json_object"
private var cachedNormalizedPath: Option[Option[String]] = None
def normalizeJsonPath(path: GpuScalar): Option[String] = {
if (path.isValid) {
val pathStr = path.getValue.toString()
JsonPathParser.parse(pathStr).map(JsonPathParser.normalize)
} else {
None
}
}
override def doColumnar(lhs: GpuColumnVector, rhs: GpuScalar): ColumnVector = {
val fromGpu = cachedNormalizedPath.getOrElse {
val normalizedPath: Option[String] = normalizeJsonPath(rhs)
cachedNormalizedPath = Some(normalizedPath)
normalizedPath
} match {
case Some(normalizedStr) =>
withResource(Scalar.fromString(normalizedStr)) { scalar =>
lhs.getBase().getJSONObject(scalar,
GetJsonObjectOptions.builder().allowSingleQuotes(true).build())
}
case None => GpuColumnVector.columnVectorFromNull(lhs.getRowCount.toInt, StringType)
}
// Below is only for testing purpose
savePathForVerify.foreach { debugPath =>
closeOnExcept(fromGpu) { _ =>
val path = rhs.getValue.asInstanceOf[UTF8String]
withResource(CpuGetJsonObject.getJsonObjectOnCpu(lhs, path)) { fromCpu =>
// verify result, save diffs if have
CpuGetJsonObject.verify(isLegacy = true, seed,
lhs.getBase, path, fromGpu, fromCpu, debugPath, saveRowsForVerify,
hconf.get.value)
}
}
}
fromGpu
}
override def doColumnar(numRows: Int, lhs: GpuScalar, rhs: GpuScalar): ColumnVector = {
withResource(GpuColumnVector.from(lhs, numRows, left.dataType)) { expandedLhs =>
doColumnar(expandedLhs, rhs)
}
}
}