/*
 * Copyright 2017-2024 John Snow Labs
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.johnsnowlabs.nlp.annotators.seq2seq

import com.johnsnowlabs.ml.gguf.GGUFWrapperMultiModal
import com.johnsnowlabs.ml.util.LlamaCPP
import com.johnsnowlabs.nlp._
import com.johnsnowlabs.nlp.annotators.cv.util.io.ImageIOUtils
import com.johnsnowlabs.nlp.llama.{LlamaException, LlamaModel}
import com.johnsnowlabs.nlp.util.io.ResourceHelper
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.SparkSession

/** Multimodal annotator that uses the llama.cpp library to generate text completions with large
  * language models. It supports ingesting images for captioning.
  *
  * At the moment, only CLIP-based models are supported.
  *
  * For settable parameters, and their explanations, see [[HasLlamaCppInferenceProperties]],
  * [[HasLlamaCppModelProperties]] and refer to the llama.cpp documentation of
  * [[https://github.com/ggerganov/llama.cpp/tree/7d5e8777ae1d21af99d4f95be10db4870720da91/examples/server server.cpp]]
  * for more information.
  *
  * If these parameters are not set, the annotator defaults to the values provided by the
  * model.
  *
  * This annotator expects a column of annotator type [[AnnotationImage]] for the image and
  * [[Annotation]] for the caption. Note that the image annotation must contain raw image bytes
  * without any preprocessing. We provide the helper function
  * [[ImageAssembler.loadImagesAsBytes]] to load the image bytes from a directory.
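  *
  * For example:
  * {{{
  * // load raw image bytes from a directory; each row will hold one image
  * val data = ImageAssembler.loadImagesAsBytes(ResourceHelper.spark, "src/test/resources/image/")
  * }}}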
  *
  * Pretrained models can be loaded with `pretrained` of the companion object:
  * {{{
  * val autoGGUFVisionModel = AutoGGUFVisionModel.pretrained()
  *   .setInputCols("image', "document")
  *   .setOutputCol("completions")
  * }}}
  * The default model is `"llava_v1.5_7b_Q4_0_gguf"`, if no name is provided.
  *
  * For available pretrained models please see the [[https://sparknlp.org/models Models Hub]].
  *
  * For extended examples of usage, see the
  * [[https://github.com/JohnSnowLabs/spark-nlp/tree/master/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFVisionModelTest.scala AutoGGUFVisionModelTest]]
  * and the
  * [[https://github.com/JohnSnowLabs/spark-nlp/tree/master/examples/python/llama.cpp/llama.cpp_in_Spark_NLP_AutoGGUFVisionModel.ipynb example notebook]].
  *
  * ==Note==
  * To use GPU inference with this annotator, make sure to use the Spark NLP GPU package and set
  * the number of GPU layers with the `setNGpuLayers` method.
  *
  * When using larger models, we recommend adjusting GPU usage with `setNCtx` and `setNGpuLayers`
  * according to your hardware to avoid out-of-memory errors.
  *
  * ==Example==
  *
  * {{{
  * import com.johnsnowlabs.nlp.ImageAssembler
  * import com.johnsnowlabs.nlp.annotator._
  * import com.johnsnowlabs.nlp.base._
  * import org.apache.spark.ml.Pipeline
  * import org.apache.spark.sql.DataFrame
  * import org.apache.spark.sql.functions.lit
  *
  * val documentAssembler = new DocumentAssembler()
  *   .setInputCol("caption")
  *   .setOutputCol("caption_document")
  *
  * val imageAssembler = new ImageAssembler()
  *   .setInputCol("image")
  *   .setOutputCol("image_assembler")
  *
  * val imagesPath = "src/test/resources/image/"
  * val data: DataFrame = ImageAssembler
  *   .loadImagesAsBytes(ResourceHelper.spark, imagesPath)
  *   .withColumn("caption", lit("Caption this image.")) // Add a caption to each image.
  *
  * val nPredict = 40
  * val model = AutoGGUFVisionModel.pretrained()
  *   .setInputCols("caption_document", "image_assembler")
  *   .setOutputCol("completions")
  *   .setBatchSize(4)
  *   .setNGpuLayers(99)
  *   .setNCtx(4096)
  *   .setMinKeep(0)
  *   .setMinP(0.05f)
  *   .setNPredict(nPredict)
  *   .setNProbs(0)
  *   .setPenalizeNl(false)
  *   .setRepeatLastN(256)
  *   .setRepeatPenalty(1.18f)
  *   .setStopStrings(Array("</s>", "Llama:", "User:"))
  *   .setTemperature(0.05f)
  *   .setTfsZ(1)
  *   .setTypicalP(1)
  *   .setTopK(40)
  *   .setTopP(0.95f)
  *
  * val pipeline = new Pipeline().setStages(Array(documentAssembler, imageAssembler, model))
  * pipeline
  *   .fit(data)
  *   .transform(data)
  *   .selectExpr("reverse(split(image.origin, '/'))[0] as image_name", "completions.result")
  *   .show(truncate = false)
  * +-----------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
  * |image_name       |result                                                                                                                                                                                        |
  * +-----------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
  * |palace.JPEG      |[ The image depicts a large, ornate room with high ceilings and beautifully decorated walls. There are several chairs placed throughout the space, some of which have cushions]               |
  * |egyptian_cat.jpeg|[ The image features two cats lying on a pink surface, possibly a bed or sofa. One cat is positioned towards the left side of the scene and appears to be sleeping while holding]             |
  * |hippopotamus.JPEG|[ A large brown hippo is swimming in a body of water, possibly an aquarium. The hippo appears to be enjoying its time in the water and seems relaxed as it floats]                            |
  * |hen.JPEG         |[ The image features a large chicken standing next to several baby chickens. In total, there are five birds in the scene: one adult and four young ones. They appear to be gathered together] |
  * |ostrich.JPEG     |[ The image features a large, long-necked bird standing in the grass. It appears to be an ostrich or similar species with its head held high and looking around. In addition to]              |
  * |junco.JPEG       |[ A small bird with a black head and white chest is standing on the snow. It appears to be looking at something, possibly food or another animal in its vicinity. The scene takes place out]  |
  * |bluetick.jpg     |[ A dog with a red collar is sitting on the floor, looking at something. The dog appears to be staring into the distance or focusing its attention on an object in front of it.]              |
  * |chihuahua.jpg    |[ A small brown dog wearing a sweater is sitting on the floor. The dog appears to be looking at something, possibly its owner or another animal in the room. It seems comfortable and relaxed]|
  * |tractor.JPEG     |[ A man is sitting in the driver's seat of a green tractor, which has yellow wheels and tires. The tractor appears to be parked on top of an empty field with]                                |
  * |ox.JPEG          |[ A large bull with horns is standing in a grassy field.]                                                                                                                                     |
  * +-----------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
  * }}}
  *
  * @param uid
  *   required uid for storing annotator to disk
  * @groupname anno Annotator types
  * @groupdesc anno
  *   Required input and expected output annotator types
  * @groupname Ungrouped Members
  * @groupname param Parameters
  * @groupname setParam Parameter setters
  * @groupname getParam Parameter getters
  * @groupprio param  1
  * @groupprio anno  2
  * @groupprio Ungrouped 3
  * @groupprio setParam  4
  * @groupprio getParam  5
  * @groupdesc param
  *   A list of (hyper-)parameter keys this annotator can take. Users can set and get the
  *   parameter values through setters and getters, respectively.
  */
class AutoGGUFVisionModel(override val uid: String)
    extends AnnotatorModel[AutoGGUFVisionModel]
    with HasBatchedAnnotateTextImage[AutoGGUFVisionModel]
    with HasEngine
    with HasLlamaCppModelProperties
    with HasLlamaCppInferenceProperties
    with HasProtectedParams {

  override val inputAnnotatorTypes: Array[AnnotatorType] =
    Array(AnnotatorType.IMAGE, AnnotatorType.DOCUMENT)
  override val outputAnnotatorType: AnnotatorType = AnnotatorType.DOCUMENT

  /** Annotator reference id. Used to identify elements in metadata or to refer to this annotator
    * type
    */
  def this() = this(Identifiable.randomUID("AutoGGUFVisionModel"))

  private var _model: Option[Broadcast[GGUFWrapperMultiModal]] = None

  /** @group getParam */
  def getModelIfNotSet: GGUFWrapperMultiModal = _model.get.value

  /** Sets the GGUF model wrapper and broadcasts it to the cluster, if it has not been set yet.
    *
    * @group setParam
    */
  def setModelIfNotSet(spark: SparkSession, wrapper: GGUFWrapperMultiModal): this.type = {
    if (_model.isEmpty) {
      _model = Some(spark.sparkContext.broadcast(wrapper))
    }

    // Entry point for models: automatically enable GPU support if detected.
    setGpuSupportIfAvailable(spark)
    this
  }

  private[johnsnowlabs] def setEngine(engineName: String): this.type = set(engine, engineName)

  /** Sets the number of parallel processes for decoding. This is an alias for `setBatchSize`.
    *
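    * For example, `setNParallel(4)` has the same effect as `setBatchSize(4)`.
    *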
    * @group setParam
    * @param nParallel
    *   The number of parallel processes for decoding
    */
  def setNParallel(nParallel: Int): this.type = {
    setBatchSize(nParallel)
  }

  setDefault(
    engine -> LlamaCPP.name,
    useChatTemplate -> true,
    nCtx -> 4096,
    nBatch -> 512,
    embedding -> false,
    nPredict -> 100)

//  val mmproj = new Param[String](
//    this,
//    "mmproj",
//    "Name of the file for the multi-modal projection (mmproj) model, that encodes the images.")
//
//  /** Sets the path to the multi-modal projection (mmproj) model, that encodes the images.
//    *
//    * Should only be used by this class and not by the user.
//    *
//    * @param value
//    *   Name of the file for the multi-modal projection (mmproj) model
//    * @return
//    */
//  private def setMmproj(value: String): this.type = set(mmproj, value)
//
//  private def getMmproj: String = $(mmproj)

  override def onWrite(path: String, spark: SparkSession): Unit = {
    super.onWrite(path, spark)
    getModelIfNotSet.saveToFile(path)
  }

  /** Completes the batch of annotations.
    *
    * @param batchedAnnotations
    *   A single batch of (caption, image) annotation pairs
    * @return
    *   Completed text sequences, one per input pair
    */
  override def batchAnnotate(
      batchedAnnotations: Seq[(Annotation, AnnotationImage)]): Seq[Seq[Annotation]] = {
    if (batchedAnnotations.nonEmpty) {

      // set parallel decoding to batch size
      val modelParams = getModelParameters.setNParallel(getBatchSize)
      val model: LlamaModel = getModelIfNotSet.getSession(modelParams)

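      // Separate the caption prompts from their paired images and base64-encode the image bytes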
      val (prompts, base64EncodedImages) = batchedAnnotations.unzip match {
        case (promptAnnotations, imageAnnotations) =>
          (
            promptAnnotations.map(_.result).toArray,
            imageAnnotations
              .map(imgAnno => ImageIOUtils.encodeImageBase64(imgAnno.result))
              .toArray)
      }

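      // Request completions for the whole batch. If llama.cpp throws, fall back to
      // empty strings and record the exception message in the output metadata.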
      val (completedTexts: Array[String], metadata: Map[String, String]) =
        try {
          (
            model.requestBatchImageCompletion(
              prompts,
              base64EncodedImages,
              getInferenceParameters),
            Map.empty)
        } catch {
          case e: LlamaException =>
            logger.error("Error in llama.cpp image batch completion", e)
            (Array.fill(prompts.length)(""), Map("llamacpp_exception" -> e.getMessage))
        }

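      // Pair each completion with its source annotations, merging their metadata into the output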
      val result: Seq[Seq[Annotation]] =
        batchedAnnotations.zip(completedTexts).map {
          case ((textAnnotation: Annotation, imageAnnotation: AnnotationImage), text) =>
            val totalMetadata =
              textAnnotation.metadata ++ imageAnnotation.metadata ++ metadata
            Seq(new Annotation(outputAnnotatorType, 0, text.length - 1, text, totalMetadata))
        }
      result
    } else Seq(Seq.empty[Annotation])
  }
}

trait ReadablePretrainedAutoGGUFVisionModel
    extends ParamsAndFeaturesReadable[AutoGGUFVisionModel]
    with HasPretrained[AutoGGUFVisionModel] {
  override val defaultModelName: Some[String] = Some("llava_v1.5_7b_Q4_0_gguf")
  override val defaultLang: String = "en"

  /** Java-compliant overrides */
  override def pretrained(): AutoGGUFVisionModel = super.pretrained()

  override def pretrained(name: String): AutoGGUFVisionModel = super.pretrained(name)

  override def pretrained(name: String, lang: String): AutoGGUFVisionModel =
    super.pretrained(name, lang)

  override def pretrained(name: String, lang: String, remoteLoc: String): AutoGGUFVisionModel =
    super.pretrained(name, lang, remoteLoc)
}

trait ReadAutoGGUFVisionModel {
  this: ParamsAndFeaturesReadable[AutoGGUFVisionModel] =>

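  /** Restores the GGUF wrapper of a saved [[AutoGGUFVisionModel]] and attaches it to the
    * deserialized instance.
    */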
  def readModel(instance: AutoGGUFVisionModel, path: String, spark: SparkSession): Unit = {
    val model: GGUFWrapperMultiModal = GGUFWrapperMultiModal.readModel(path, spark)

    instance.setModelIfNotSet(spark, model)
  }

  addReader(readModel)

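  /** Loads a local GGUF model together with its multi-modal projection (mmproj) file into an
    * [[AutoGGUFVisionModel]].
    *
    * A minimal usage sketch, assuming `path/to/model.gguf` and `path/to/mmproj.gguf` are
    * placeholder paths to local GGUF files:
    * {{{
    * val visionModel = AutoGGUFVisionModel.loadSavedModel(
    *   "path/to/model.gguf",
    *   "path/to/mmproj.gguf",
    *   ResourceHelper.spark)
    * }}}
    *
    * @param modelPath
    *   Path to the GGUF model file
    * @param mmprojPath
    *   Path to the multi-modal projection (mmproj) GGUF file
    * @param spark
    *   The active SparkSession
    */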
  def loadSavedModel(
      modelPath: String,
      mmprojPath: String,
      spark: SparkSession): AutoGGUFVisionModel = {
    // TODO: potentially enable download from HF URLs
    val localPathModel: String = ResourceHelper.copyToLocal(modelPath)
    val localPathMmproj: String = ResourceHelper.copyToLocal(mmprojPath)

    val annotatorModel = new AutoGGUFVisionModel()
    val wrapper = GGUFWrapperMultiModal.read(spark, localPathModel, localPathMmproj)

    annotatorModel
      .setModelIfNotSet(spark, wrapper)
      .setEngine(LlamaCPP.name)

    // TODO mmproj metadata necessary?
    val metadata = LlamaModel.getMetadataFromFile(localPathModel)
    if (metadata.nonEmpty) annotatorModel.setMetadata(metadata)
    annotatorModel
  }
}

/** This is the companion object of [[AutoGGUFVisionModel]]. Please refer to that class for the
  * documentation.
  */
object AutoGGUFVisionModel
    extends ReadablePretrainedAutoGGUFVisionModel
    with ReadAutoGGUFVisionModel
