/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.kylin.streaming

import java.nio.ByteBuffer
import org.apache.commons.lang3.StringUtils
import org.apache.kylin.common.KylinConfig
import org.apache.kylin.engine.spark.NSparkCubingEngine
import org.apache.kylin.engine.spark.builder.CreateFlatTable
import org.apache.kylin.engine.spark.job.{FlatTableHelper, NSparkCubingUtil}
import org.apache.kylin.metadata.cube.model.{NCubeJoinedFlatTableDesc, NDataSegment}
import org.apache.kylin.metadata.cube.utils.StreamingUtils
import org.apache.kylin.metadata.model._
import org.apache.kylin.parser.AbstractDataParser
import org.apache.kylin.source.SourceFactory
import org.apache.kylin.streaming.common.CreateFlatTableEntry
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types._
import org.apache.spark.sql.util.SparderTypeUtil
import org.apache.spark.sql.{DataFrame, Dataset, Row}
import org.apache.spark.storage.StorageLevel
import scala.collection.JavaConverters._
import scala.collection.mutable
import org.apache.kylin.guava30.shaded.common.base.Preconditions
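
/**
 * Builds the streaming flat table for a model whose root fact table is a Kafka source: the fact
 * table is read as a streaming Dataset, optionally pre-aggregated under a watermark, joined with
 * the lookup tables (which are cached and can be reloaded once the configured refresh interval
 * elapses, see [[shouldRefreshTable]]), and later dictionary-encoded per batch via
 * [[encodeStreamingDataset]].
 *
 * A minimal usage sketch, assuming a populated [[CreateFlatTableEntry]] and [[KylinConfig]]:
 * {{{
 *   val flatTable = CreateStreamingFlatTable(entry)
 *   val joined    = flatTable.generateStreamingDataset(config)
 * }}}
 */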
class CreateStreamingFlatTable(entry: CreateFlatTableEntry) extends
  CreateFlatTable(entry.flatTable, entry.seg, entry.toBuildTree, entry.ss, entry.sourceInfo) {

  import org.apache.kylin.engine.spark.builder.CreateFlatTable._

  private val MAX_OFFSETS_PER_TRIGGER = "maxOffsetsPerTrigger"
  private val STARTING_OFFSETS = "startingOffsets"
  private val SECURITY_PROTOCOL = "security.protocol"
  private val SASL_MECHANISM = "sasl.mechanism"

  var lookupTablesGlobal: mutable.LinkedHashMap[JoinTableDesc, Dataset[Row]] = _
  var factTableDataset: Dataset[Row] = _
  var tableRefreshInterval: Long = -1L
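
  /**
   * Creates the streaming flat table Dataset: merges the Kafka overrides from the Kylin
   * configuration into the table's Kafka parameters (security.protocol and sasl.mechanism are
   * re-keyed with the "kafka." prefix expected by Spark's Kafka source), reads and parses the
   * root fact table stream, optionally pre-aggregates it under the configured watermark, loads
   * and caches the lookup tables, and finally joins them with the fact table.
   */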
  def generateStreamingDataset(config: KylinConfig): Dataset[Row] = {
    val model = flatTable.getDataModel
    val tableDesc = model.getRootFactTable.getTableDesc
    val kafkaParam = tableDesc.getKafkaConfig.getKafkaParam
    val kafkaJobParams = config.getStreamingKafkaConfigOverride.asScala

    val securityProtocol = kafkaJobParams.get(SECURITY_PROTOCOL)
    if (securityProtocol.isDefined) {
      kafkaJobParams.remove(SECURITY_PROTOCOL)
      kafkaJobParams.put("kafka." + SECURITY_PROTOCOL, securityProtocol.get)
    }
    val saslMechanism = kafkaJobParams.get(SASL_MECHANISM)
    if (saslMechanism.isDefined) {
      kafkaJobParams.remove(SASL_MECHANISM)
      kafkaJobParams.put("kafka." + SASL_MECHANISM, saslMechanism.get)
    }
    kafkaJobParams.foreach { param =>
      param._1 match {
        case MAX_OFFSETS_PER_TRIGGER => if (param._2.toInt > 0) {
          val maxOffsetsPerTrigger = param._2.toInt
          kafkaParam.put("maxOffsetsPerTrigger", String.valueOf(maxOffsetsPerTrigger))
        }
        case STARTING_OFFSETS => if (!StringUtils.isEmpty(param._2)) {
          kafkaParam.put("startingOffsets", param._2)
        }
        case _ => kafkaParam.put(param._1, param._2)
      }
    }

    val originFactTable = SourceFactory
      .createEngineAdapter(tableDesc, classOf[NSparkCubingEngine.NSparkCubingSource])
      .getSourceData(tableDesc, ss, kafkaParam)
    val schema = StructType(
      tableDesc.getColumns.map { columnDescs =>
        StructField(columnDescs.getName, SparderTypeUtil.toSparkType(columnDescs.getType))
      }
    )
    val rootFactTable = changeSchemaToAliasDotName(
      CreateStreamingFlatTable.castDF(originFactTable, schema, partitionColumn(), entry.parserName)
        .alias(model.getRootFactTable.getAlias),
      model.getRootFactTable.getAlias)

    factTableDataset =
      if (!StringUtils.isEmpty(entry.watermark)) {
        import org.apache.spark.sql.functions._
        val cols = model.getRootFactTable.getColumns.asScala.map(item => {
          col(NSparkCubingUtil.convertFromDot(item.getBackTickIdentity))
        }).toList
        rootFactTable.withWatermark(partitionColumn(), entry.watermark).groupBy(cols: _*).count()
      } else {
        rootFactTable
      }

    tableRefreshInterval = StreamingUtils.parseTableRefreshInterval(config.getStreamingTableRefreshInterval)
    loadLookupTables()
    joinFactTableWithLookupTables(factTableDataset, lookupTablesGlobal, model, ss)
  }
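
  /**
   * Builds the lookup table Datasets for the model (computed columns are cleaned against the fact
   * table's columns first) and persists each of them at MEMORY_AND_DISK for reuse.
   */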
  def loadLookupTables(): Unit = {
    val ccCols = model().getRootFactTable.getColumns.asScala.filter(_.getColumnDesc.isComputedColumn).toSet
    val cleanLookupCC = cleanComputColumn(ccCols.toSeq, factTableDataset.columns.toSet)
    lookupTablesGlobal = generateLookupTableDataset(model(), cleanLookupCC, ss)
    lookupTablesGlobal.foreach { case (_, df) =>
      df.persist(StorageLevel.MEMORY_AND_DISK)
    }
  }
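
  /** Whether the lookup tables should be reloaded periodically, i.e. a positive refresh interval is configured. */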
  def shouldRefreshTable(): Boolean = {
    tableRefreshInterval > 0
  }
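
  /** The data model that backs the flat table descriptor. */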
  def model(): NDataModel = {
    flatTable.getDataModel
  }
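
  /**
   * The partition date column of the model without its table prefix ("TABLE.COLUMN" -> "COLUMN");
   * the configured value must contain a dot.
   */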
  def partitionColumn(): String = {
    val partitionDateTableDotColumn = model().getPartitionDesc.getPartitionDateColumn
    val dotIdx = partitionDateTableDotColumn.lastIndexOf(".")
    Preconditions.checkArgument(dotIdx != -1)
    partitionDateTableDotColumn.substring(dotIdx + 1)
  }
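
  /**
   * Encodes one batch of the flat table: applies the segment's global dictionaries to the columns
   * that require encoding, applies the model's filter condition, and re-maps the schema to flat
   * table column indices. Only [[NCubeJoinedFlatTableDesc]] flat tables are supported.
   */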
  def encodeStreamingDataset(seg: NDataSegment, model: NDataModel, batchDataset: Dataset[Row]): Dataset[Row] = {
    val ccCols = model.getRootFactTable.getColumns.asScala.toSet
    val (dictCols, encodeCols): GlobalDictType = assemblyGlobalDictTuple(seg, toBuildTree)
    val encodedDataset = encodeWithCols(batchDataset, ccCols, dictCols, encodeCols)
    val filterEncodedDataset = FlatTableHelper.applyFilterCondition(flatTable, encodedDataset, needReplaceDot = true)

    flatTable match {
      case joined: NCubeJoinedFlatTableDesc =>
        changeSchemeToColumnIndice(filterEncodedDataset, joined)
      case unsupported =>
        throw new UnsupportedOperationException(
          s"Unsupported flat table desc type : ${unsupported.getClass}.")
    }
  }
}
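
/** Factory and parsing helpers for [[CreateStreamingFlatTable]]. */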
object CreateStreamingFlatTable {

  def apply(createFlatTableEntry: CreateFlatTableEntry): CreateStreamingFlatTable = {
    new CreateStreamingFlatTable(createFlatTableEntry)
  }
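
  /**
   * Casts the raw Kafka `value` column to a string and parses each message with the configured
   * [[AbstractDataParser]] into a [[Row]] matching `parsedSchema`; rows whose field count does not
   * match the schema (i.e. messages the parser could not fully map) are dropped.
   */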
  def castDF(df: DataFrame, parsedSchema: StructType, partitionColumn: String, parserName: String): DataFrame = {
    df.selectExpr("CAST(value AS STRING) as rawValue")
      .mapPartitions { rows =>
        val dataParser: AbstractDataParser[ByteBuffer] = AbstractDataParser
          .getDataParser(parserName, Thread.currentThread.getContextClassLoader)
        val newRows = new PartitionRowIterator(rows, parsedSchema, partitionColumn, dataParser)
        newRows.filter(row => row.size == parsedSchema.length)
      }(RowEncoder(parsedSchema))
  }
}