/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.kylin.source.kafka
import java.util.Map
import java.util.concurrent.ArrayBlockingQueue
import org.apache.commons.lang3.StringUtils
import org.apache.kylin.common.KylinConfig
import org.apache.kylin.engine.spark.NSparkCubingEngine
import org.apache.kylin.metadata.model.{IBuildable, SegmentRange, TableDesc}
import org.apache.kylin.source.{IReadableTable, ISampleDataDeployer, ISource, ISourceMetadataExplorer}
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.execution.streaming.MemoryStream
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.sql.{Dataset, Row, SparkSession}
import scala.io.Source
class NSparkKafkaSource(val kylinConfig: KylinConfig) extends ISource {
  // Queue of local text files that stand in for Kafka messages in unit tests.
  private val textFileQueue = new ArrayBlockingQueue[String](1000)
  // In-memory stream used instead of the file queue when memoryStreamEnabled is set.
  private var msEvents: MemoryStream[Row] = null
  private var memoryStreamEnabled = false
  // scalastyle:off
  /**
   * Return an explorer to sync table metadata from the data source.
   */
  override def getSourceMetadataExplorer: ISourceMetadataExplorer = {
    new KafkaExplorer()
  }
  /**
   * Return an adaptor that implements the specified interface as requested by the build engine.
   * IMRInput, in particular, is required by the MR build engine.
   */
  override def adaptToBuildEngine[I](engineInterface: Class[I]): I = {
    if (engineInterface eq classOf[NSparkCubingEngine.NSparkCubingSource]) {
      val source = new NSparkCubingEngine.NSparkCubingSource() {
        override def getSourceData(table: TableDesc, ss: SparkSession, parameters: Map[String, String]): Dataset[Row] = {
          if (KylinConfig.getInstanceFromEnv.isUTEnv) {
            // In unit-test environments, local text files posted via post() stand in for Kafka messages.
            val schema = new StructType().add("value", StringType)
            if (memoryStreamEnabled) {
              // Feed the file content through an in-memory stream to mimic a streaming source.
              msEvents = MemoryStream[Row](1)(RowEncoder(schema), ss.sqlContext)
              val text = Source.fromFile(textFileQueue.take())
              msEvents.addData(Row(text.getLines().mkString))
              msEvents.toDS()
            } else {
              // Read the queued file as a batch DataFrame with the same single "value" column.
              val rdd = ss.read.text(textFileQueue.take()).rdd
              val dataframe = ss.createDataFrame(rdd, schema)
              dataframe.as[Row](RowEncoder(schema))
            }
          } else {
            // In real deployments, load the table as a structured streaming source backed by Kafka.
            ss.readStream.format("kafka").options(parameters).load()
          }
        }
      }
      return source.asInstanceOf[I]
    }
    throw new IllegalArgumentException("Unsupported engine interface: " + engineInterface)
  }
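  // A minimal sketch of how a build engine might request this adaptor; the
  // tableDesc, sparkSession and kafkaParams values below are illustrative
  // placeholders, not produced by this class:
  //
  //   val source = new NSparkKafkaSource(KylinConfig.getInstanceFromEnv)
  //   val cubingSource = source.adaptToBuildEngine(classOf[NSparkCubingEngine.NSparkCubingSource])
  //   val df: Dataset[Row] = cubingSource.getSourceData(tableDesc, sparkSession, kafkaParams)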
  /**
   * Return a ReadableTable that can iterate through the rows of the given table.
   */
  override def createReadableTable(tableDesc: TableDesc): IReadableTable = {
    throw new UnsupportedOperationException
  }
  /**
   * Give the source a chance to enrich a SourcePartition before the build starts.
   * In particular, the Kafka source uses this chance to define start/end offsets within each partition.
   */
  override def enrichSourcePartitionBeforeBuild(buildable: IBuildable,
      segmentRange: SegmentRange[_ <: Comparable[_]]): SegmentRange[_ <: Comparable[_]] = {
    throw new UnsupportedOperationException
  }
  /**
   * Return an object that is responsible for deploying sample (CSV) data to the source database.
   * For testing purposes.
   */
  override def getSampleDataDeployer: ISampleDataDeployer = {
    throw new UnsupportedOperationException
  }
  /**
   * Build a Kafka-offset partitioned segment range. An empty start defaults to offset 0
   * and an empty end defaults to Long.MaxValue.
   */
  override def getSegmentRange(start: String, end: String): SegmentRange[_ <: Comparable[_]] = {
    val startTime = if (StringUtils.isEmpty(start)) "0" else start
    val endTime = if (StringUtils.isEmpty(end)) Long.MaxValue.toString else end
    new SegmentRange.KafkaOffsetPartitionedSegmentRange(startTime.toLong, endTime.toLong)
  }
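  // A minimal sketch of the defaulting behaviour above (offset values are illustrative):
  //
  //   val source = new NSparkKafkaSource(KylinConfig.getInstanceFromEnv)
  //   source.getSegmentRange("", "")        // covers offsets 0 .. Long.MaxValue
  //   source.getSegmentRange("100", "200")  // covers offsets 100 .. 200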
  /** Whether test data is fed through a MemoryStream (true) or read from queued text files (false). */
  def enableMemoryStream(): Boolean = {
    this.memoryStreamEnabled
  }
  def enableMemoryStream(mse: Boolean): Unit = {
    this.memoryStreamEnabled = mse
  }
  /** Feed a text file into the source: added to the MemoryStream when it is active, otherwise queued for the next getSourceData call. */
  def post(textFile: String): Unit = {
    if (msEvents != null && memoryStreamEnabled) {
      val text = Source.fromFile(textFile)
      msEvents.addData(Row(text.getLines().mkString))
    } else {
      textFileQueue.offer(textFile)
    }
  }
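  // A minimal test-flow sketch, assuming a unit-test environment in which
  // getSourceData falls back to local text files (the file path is a
  // hypothetical example):
  //
  //   val source = new NSparkKafkaSource(KylinConfig.getInstanceFromEnv)
  //   source.enableMemoryStream(false)          // queue files instead of using a MemoryStream
  //   source.post("/tmp/sample-messages.json")  // consumed by the next getSourceData call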
  override def supportBuildSnapShotByPartition: Boolean = true
  // scalastyle:on
}