com.johnsnowlabs.ml.tensorflow.NerDatasetEncoder.scala

/*
 * Copyright 2017-2022 John Snow Labs
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.johnsnowlabs.ml.tensorflow

import com.johnsnowlabs.nlp.annotators.common.WordpieceEmbeddingsSentence

import scala.collection.Map

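/** Encodes pre-tokenized sentences (with wordpiece embeddings) into the numeric batch
  * format consumed by the TensorFlow NER graph, and maps tags and characters to and
  * from the integer ids used by the model.
  *
  * @param params
  *   vocabulary of tags and characters plus the embedding configuration
  */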
class NerDatasetEncoder(val params: DatasetEncoderParams) extends Serializable {

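  // Tag ids: the default tag is always 0, all other tags get ids starting at 1.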
  private val nonDefaultTags = params.tags
    .filter(_ != params.defaultTag)
    .zipWithIndex
    .map(p => (p._1, p._2 + 1))
    .toMap

  val tag2Id: Map[String, Int] = Map(params.defaultTag -> 0) ++ nonDefaultTags
  val tags: Array[String] = tag2Id
    .map(p => (p._2, p._1))
    .toArray
    .sortBy(p => p._1)
    .map(p => p._2)

  val chars: Array[Char] = params.chars.toArray

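  // Character ids start at 1; id 0 is reserved for unknown characters and padding.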
  val char2Id: Map[Char, Int] = params.chars.zip(1 to params.chars.length).toMap

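  // Safe indexed access: returns `value` when `i` is out of bounds, used for padding below.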
  def getOrElse[T](source: Array[T], i: Int, value: => T): T = {
    if (i < source.length)
      source(i)
    else
      value
  }

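  /** Encodes a batch of sentences into Tensorflow input tensors.
    *
    * Sentences are padded to the longest sentence in the batch and words to the longest
    * word; padded positions get the empty embedding, char id 0 and `isWordStart = false`.
    *
    * @param sentences
    *   batch of sentences with wordpiece embeddings
    * @return
    *   the padded batch in Tensorflow input format
    */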
  def encodeInputData(sentences: Array[WordpieceEmbeddingsSentence]): NerBatch = {

    val batchSize = sentences.length

    if (batchSize == 0)
      return NerBatch.empty

    val sentenceLengths = sentences.map(s => s.tokens.length)
    val maxSentenceLength = sentenceLengths.max

    if (maxSentenceLength == 0)
      return NerBatch.empty

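    // Character length of every token's wordpiece, padded with 0 to the longest sentence.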
    val wordLengths = sentences.map { sentence =>
      val lengths = sentence.tokens.map(word => word.wordpiece.length)
      Range(0, maxSentenceLength).map { idx =>
        getOrElse(lengths, idx, 0)
      }.toArray
    }

    // Defensive check: a batch with tokens but no wordpiece lengths cannot be encoded.
    assert(wordLengths.flatten.nonEmpty, "Batch contains no wordpiece lengths")
    if (wordLengths.flatten.isEmpty) {
      return NerBatch.empty
    }

    val maxWordLength = wordLengths.flatten.max

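    // Token embeddings, padded with the empty embedding to the longest sentence.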
    val wordEmbeddings =
      Range(0, batchSize).map { i =>
        val sentence = sentences(i)
        Range(0, maxSentenceLength).map { j =>
          if (j < sentence.tokens.length)
            sentence.tokens(j).embeddings
          else
            params.emptyEmbeddings

        }.toArray
      }.toArray

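    // Character ids per token, padded with 0 to the longest word in the batch.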
    val charIds =
      Range(0, batchSize).map { i =>
        val sentence = sentences(i)
        Range(0, maxSentenceLength).map { j =>
          val word = (if (j < sentence.tokens.length)
                        sentence.tokens(j).wordpiece
                      else
                        "").toCharArray

          Range(0, maxWordLength).map { k =>
            val char = getOrElse(word, k, Char.MinValue)
            char2Id.getOrElse(char, 0)
          }.toArray
        }.toArray
      }.toArray

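    // Marks which wordpieces start a token; padded positions are false.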
    val isWordStart = sentences.map { sentence =>
      Range(0, maxSentenceLength).map { j =>
        if (j < sentence.tokens.length)
          sentence.tokens(j).isWordStart
        else
          false
      }.toArray
    }

    new NerBatch(
      wordEmbeddings,
      charIds,
      wordLengths,
      sentenceLengths,
      maxSentenceLength,
      isWordStart)
  }

  /** Converts Tag names to Identifiers
    *
    * @param tags
    *   batches of labels/classes for each sentence/document
    * @return
    *   batches of tag ids for each sentence/document
    */
  def encodeTags(tags: Array[Array[String]]): Array[Array[Int]] = {
    val batchSize = tags.length
    val maxSentence = tags.map(t => t.length).max

    (0 until batchSize).map { i =>
      (0 until maxSentence).map { j =>
        val tag = getOrElse(tags(i), j, params.defaultTag)
        tag2Id.getOrElse(tag, 0)
      }.toArray
    }.toArray
  }

  /** Converts Tag Identifiers to Source Names
    *
    * @param tagIds
    *   Tag Ids encoded for Tensorflow Model.
    * @return
    *   Tag names
    */
  def decodeOutputData(tagIds: Array[Int]): Array[String] = {
    tagIds.map(id => getOrElse(tags, id, params.defaultTag))
  }

  /** Converts Tensorflow tags output to a 2-dimensional Array with shape: (Batch, Sentence
    * Length).
    *
    * @param predictedTags
    *   2-dimensional tensor in plain array
    * @param allTags
    *   All original tags
    * @param sentenceLength
    *   Every sentence length (number of words).
    * @param prob
    *   Optional probability arrays, one per token, used to attach confidence scores
    * @param includeAllConfidenceScores
    *   Whether to report a confidence score for every tag or only for the predicted tag
    * @return
    *   Tags for each sentence, each paired with optional confidence metadata
    */
  def convertBatchTags(
      predictedTags: Array[String],
      allTags: Array[String],
      sentenceLength: Array[Int],
      prob: Option[Seq[Array[Float]]],
      includeAllConfidenceScores: Boolean)
      : Array[Array[(String, Option[Array[Map[String, String]]])]] = {

    val sentences = sentenceLength.length
    val maxSentenceLength = predictedTags.length / sentences

    Range(0, sentences).map { i =>
      Range(0, sentenceLength(i)).map { j =>
        {
          val index = i * maxSentenceLength + j
          val metaWithProb: Option[Array[Map[String, String]]] = if (prob.isDefined) {
            if (includeAllConfidenceScores) {
              // Report a confidence score for every tag, keyed by tag name.
              Some(
                allTags.zipWithIndex
                  .map { case (t, tagIdx) =>
                    Map(
                      t -> prob
                        .map(_(index))
                        .getOrElse(Array.empty[Float])
                        .lift(tagIdx)
                        .getOrElse(0.0f)
                        .toString)
                  })
            } else {
              // Report only a single confidence score for the predicted tag.
              Some(
                Array(
                  Map(
                    "confidence" -> prob
                      .map(_(index))
                      .getOrElse(Array.empty[Float])
                      .lift(0)
                      .getOrElse(0.0f)
                      .toString)))
            }
          } else None

          (predictedTags(index), metaWithProb)
        }
      }.toArray
    }.toArray
  }
}

/** Batch that contains data in Tensorflow input format.
  *
  * @param wordEmbeddings
  *   Word vector representation. Shape: Batch x Max Sentence Length x Embeddings Dim
  * @param charIds
  *   Char ids for every word in every sentence. Shape: Batch x Max Sentence Length x Max Word
  *   length
  * @param wordLengths
  *   Length of every word in every sentence. Shape: Batch x Max Sentence Length
  * @param sentenceLengths
  *   Length of every batch sentence. Shape: Batch
  * @param maxLength
  *   Max length of sentence
  * @param isWordStart
  *   Whether the current wordpiece is the start of a token. Shape: Batch x Max Sentence Length
  */
class NerBatch(
    val wordEmbeddings: Array[Array[Array[Float]]],
    val charIds: Array[Array[Array[Int]]],
    val wordLengths: Array[Array[Int]],
    val sentenceLengths: Array[Int],
    val maxLength: Int,
    val isWordStart: Array[Array[Boolean]]) {
  def batchSize: Int = wordEmbeddings.length
}

object NerBatch {
  def empty = new NerBatch(Array.empty, Array.empty, Array.empty, Array.empty, 0, Array.empty)
}

/** @param tags
  *   list of unique tags
  * @param chars
  *   list of unique characters
  * @param emptyVector
  *   embedding vector used to pad positions past the end of a sentence
  * @param embeddingsDim
  *   dimension of embeddings
  * @param defaultTag
  *   the default tag
  */
case class DatasetEncoderParams(
    tags: List[String],
    chars: List[Char],
    emptyVector: List[Float],
    embeddingsDim: Int,
    defaultTag: String = "O") {
  val emptyEmbeddings: Array[Float] = emptyVector.toArray
}
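
/* Usage sketch (not part of the library): shows the tag id round-trip. The tag set,
 * character set, and embedding dimension below are illustrative assumptions, not
 * values required by Spark NLP. */
object NerDatasetEncoderUsageSketch {
  def main(args: Array[String]): Unit = {
    val params = DatasetEncoderParams(
      tags = List("O", "B-PER", "I-PER"), // hypothetical tag set
      chars = ('a' to 'z').toList, // hypothetical character set
      emptyVector = List.fill(4)(0.0f), // zero vector used to pad missing tokens
      embeddingsDim = 4)
    val encoder = new NerDatasetEncoder(params)

    // Tag names -> ids; the default tag always maps to 0.
    val tagIds: Array[Array[Int]] = encoder.encodeTags(Array(Array("B-PER", "I-PER", "O")))

    // Ids -> tag names; out-of-range ids fall back to the default tag.
    val decoded: Array[String] = encoder.decodeOutputData(tagIds.head)
    println(decoded.mkString(" ")) // prints: B-PER I-PER O
  }
}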
