com.johnsnowlabs.ml.ai.util.PrepareEmbeddings.scala Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of spark-nlp_2.12 Show documentation
spark-nlp
The newest version!
/*
 * Copyright 2017-2022 John Snow Labs
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.johnsnowlabs.ml.ai.util

import com.johnsnowlabs.ml.tensorflow.TensorResources
import com.johnsnowlabs.nlp.annotators.common._
import org.tensorflow.Tensor
import org.tensorflow.ndarray.buffer.IntDataBuffer

private[johnsnowlabs] object PrepareEmbeddings {

  /** prepare batches of piece IDs while padding shorter sequences to the longest sequence length
    * and trim sequences longer than maxSequenceLength
    *
    * @param sentences
    *   batches of WordpieceTokenizedSentence
    * @param maxSequenceLength
    *   provided maximum allowed sequence length
    * @param sentenceStartTokenId
    *   id for token piece at the beginning of sequence
    * @param sentenceEndTokenId
    *   id for token piece at the end of sequence
    * @param sentencePadTokenId
    *   id for token piece used for padding
    * @return
    */
  def prepareBatchInputsWithPadding(
      sentences: Seq[(WordpieceTokenizedSentence, Int)],
      maxSequenceLength: Int,
      sentenceStartTokenId: Int,
      sentenceEndTokenId: Int,
      sentencePadTokenId: Int = 0): Seq[Array[Int]] = {
    val maxSentenceLength =
      Array(
        maxSequenceLength - 2,
        sentences.map { case (wpTokSentence, _) =>
          wpTokSentence.tokens.length
        }.max).min

    sentences
      .map { case (wpTokSentence, _) =>
        val tokenPieceIds = wpTokSentence.tokens.map(t => t.pieceId)
        val padding = Array.fill(maxSentenceLength - tokenPieceIds.length)(sentencePadTokenId)

        Array(sentenceStartTokenId) ++ tokenPieceIds.take(maxSentenceLength) ++ Array(
          sentenceEndTokenId) ++ padding
      }
  }

  def prepareOvLongBatchTensors(
      batch: Seq[Array[Int]],
      maxSentenceLength: Int,
      batchLength: Int,
      sentencePadTokenId: Int = 0): (org.intel.openvino.Tensor, org.intel.openvino.Tensor) = {
    val shape = Array(batchLength, maxSentenceLength)
    val tokenTensors =
      new org.intel.openvino.Tensor(shape, batch.flatMap(x => x.map(xx => xx.toLong)).toArray)
    val maskTensors = new org.intel.openvino.Tensor(
      shape,
      batch
        .flatMap(sentence => sentence.map(x => if (x == sentencePadTokenId) 0L else 1L))
        .toArray)

    (tokenTensors, maskTensors)
  }

  def prepareOvIntBatchTensorsWithSegment(
      batch: Seq[Array[Int]],
      maxSentenceLength: Int,
      batchLength: Int,
      sentencePadTokenId: Int = 0,
      shape: Option[Array[Int]] = None)
      : (org.intel.openvino.Tensor, org.intel.openvino.Tensor, org.intel.openvino.Tensor) = {
    val tensorsShape = if (shape.isDefined) shape.get else Array(batchLength, maxSentenceLength)
    val inputIds = batch.flatten.toArray
    val attentionMask = batch
      .flatMap(sentence => sentence.map(x => if (x == sentencePadTokenId) 0 else 1))
      .toArray

    val tokenTensors = new org.intel.openvino.Tensor(tensorsShape, inputIds)
    val maskTensors = new org.intel.openvino.Tensor(tensorsShape, attentionMask)

    val segmentTensors =
      new org.intel.openvino.Tensor(tensorsShape, Array.fill(batchLength * maxSentenceLength)(0))

    (tokenTensors, maskTensors, segmentTensors)
  }

  def prepareOvLongBatchTensorsWithSegment(
      batch: Seq[Array[Int]],
      maxSentenceLength: Int,
      batchLength: Int,
      sentencePadTokenId: Int = 0,
      shape: Option[Array[Int]] = None)
      : (org.intel.openvino.Tensor, org.intel.openvino.Tensor, org.intel.openvino.Tensor) = {
    val tensorsShape = if (shape.isDefined) shape.get else Array(batchLength, maxSentenceLength)
    val inputIds = batch.flatMap(x => x.map(xx => xx.toLong)).toArray
    val attentionMask = batch
      .flatMap(sentence => sentence.map(x => if (x == sentencePadTokenId) 0L else 1L))
      .toArray

    val tokenTensors = new org.intel.openvino.Tensor(tensorsShape, inputIds)
    val maskTensors = new org.intel.openvino.Tensor(tensorsShape, attentionMask)

    val segmentTensors =
      new org.intel.openvino.Tensor(tensorsShape, Array.fill(batchLength * maxSentenceLength)(0L))

    (tokenTensors, maskTensors, segmentTensors)
  }

  def prepareBatchTensors(
      tensors: TensorResources,
      batch: Seq[Array[Int]],
      maxSentenceLength: Int,
      batchLength: Int,
      sentencePadTokenId: Int = 0): (Tensor, Tensor) = {

    val tokenBuffers: IntDataBuffer = tensors.createIntBuffer(batchLength * maxSentenceLength)
    val maskBuffers: IntDataBuffer = tensors.createIntBuffer(batchLength * maxSentenceLength)

    batch.zipWithIndex
      .foreach { case (sentence, idx) =>
        val offset = idx * maxSentenceLength
        tokenBuffers.offset(offset).write(sentence)
        maskBuffers.offset(offset).write(sentence.map(x => if (x == sentencePadTokenId) 0 else 1))
      }

    // [nb of encoded sentences , maxSentenceLength]
    val shape = Array(batch.length.toLong, maxSentenceLength)

    val tokenTensors = tensors.createIntBufferTensor(shape, tokenBuffers)
    val maskTensors = tensors.createIntBufferTensor(shape, maskBuffers)

    (tokenTensors, maskTensors)

  }

  def prepareBatchTensorsWithSegment(
      tensors: TensorResources,
      batch: Seq[Array[Int]],
      maxSentenceLength: Int,
      batchLength: Int,
      sentencePadTokenId: Int = 0): (Tensor, Tensor, Tensor) = {

    val tokenBuffers: IntDataBuffer = tensors.createIntBuffer(batchLength * maxSentenceLength)
    val maskBuffers: IntDataBuffer = tensors.createIntBuffer(batchLength * maxSentenceLength)
    val segmentBuffers: IntDataBuffer = tensors.createIntBuffer(batchLength * maxSentenceLength)

    batch.zipWithIndex
      .foreach { case (sentence, idx) =>
        val offset = idx * maxSentenceLength
        tokenBuffers.offset(offset).write(sentence)
        maskBuffers.offset(offset).write(sentence.map(x => if (x == sentencePadTokenId) 0 else 1))
        segmentBuffers.offset(offset).write(Array.fill(maxSentenceLength)(0))
      }

    // [nb of encoded sentences , maxSentenceLength]
    val shape = Array(batch.length.toLong, maxSentenceLength)

    val tokenTensors = tensors.createIntBufferTensor(shape, tokenBuffers)
    val maskTensors = tensors.createIntBufferTensor(shape, maskBuffers)
    val segmentTensors = tensors.createIntBufferTensor(shape, segmentBuffers)

    (tokenTensors, maskTensors, segmentTensors)

  }

  def prepareBatchWordEmbeddings(
      batch: Seq[Array[Int]],
      embeddings: Array[Float],
      maxSentenceLength: Int,
      batchLength: Int): Seq[Array[Array[Float]]] = {
    val dim = embeddings.length / (batchLength * maxSentenceLength)
    val batchEmbeddings: Array[Array[Array[Float]]] =
      embeddings.grouped(dim).toArray.grouped(maxSentenceLength).toArray

    val emptyVector = Array.fill(dim)(0f)

    batch.zip(batchEmbeddings).map { case (ids, embeddings) =>
      if (ids.length > embeddings.length) {
        embeddings.take(embeddings.length - 1) ++
          Array.fill(embeddings.length - ids.length)(emptyVector) ++
          Array(embeddings.last)
      } else {
        embeddings
      }
    }
  }

}