All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.intel.analytics.bigdl.dataset.text.TextToLabeledSentence.scala Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2016 The BigDL Authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.intel.analytics.bigdl.dataset.text

import com.intel.analytics.bigdl.dataset.Transformer
import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric

import scala.collection.Iterator
import scala.reflect.ClassTag

/**
 * Factory methods for the text transformers defined in this file.
 *
 * The overload is selected by the argument type: a [[Dictionary]] yields a
 * [[TextToLabeledSentence]], an `Int` yields a [[TextToSentenceWithSteps]].
 */
object TextToLabeledSentence {

  /**
   * Create a transformer that turns tokenized sentences into LabeledSentence
   * using the given dictionary.
   *
   * @param dictionary dictionary handed to the new [[TextToLabeledSentence]]
   * @param ev numeric operations for `T`
   * @tparam T numeric element type
   * @return a new [[TextToLabeledSentence]]
   */
  def apply[T: ClassTag](dictionary: Dictionary)
    (implicit ev: TensorNumeric[T]): TextToLabeledSentence[T] = {
    new TextToLabeledSentence[T](dictionary)
  }

  /**
   * Create a transformer that turns arrays of `T` into LabeledSentence whose
   * data and label both have `numSteps` elements.
   *
   * @param numSteps number of steps handed to the new [[TextToSentenceWithSteps]]
   * @param ev numeric operations for `T`
   * @tparam T numeric element type
   * @return a new [[TextToSentenceWithSteps]]
   */
  def apply[T: ClassTag](numSteps: Int)
    (implicit ev: TensorNumeric[T]): TextToSentenceWithSteps[T] = {
    new TextToSentenceWithSteps[T](numSteps)
  }
}

/**
 * Transform a tokenized sentence into a LabeledSentence.
 *
 * Every token is looked up with `dictionary.getIndex` and converted to `T`.
 * The data is the index sequence without its last element and the label is
 * the index sequence without its first element, i.e. the label is the data
 * shifted by one position.
 *
 * e.g. ["I", "love", "Intel"] => [0, 1, 2]
 *      data:  [0, 1]
 *      label: [1, 2]
 *
 * The input Array[String] should be a tokenized sentence.
 * e.g. I love Intel => ["I", "love", "Intel"]
 *
 * A single LabeledSentence instance is held by this transformer and
 * `copy` is invoked on it for every input sentence.
 *
 * @param dictionary dictionary used to map each token to its integer index
 * @param ev numeric operations for `T`, used to convert the Int index to `T`
 * @tparam T numeric element type of the produced LabeledSentence
 */
class TextToLabeledSentence[T: ClassTag](dictionary: Dictionary)
  (implicit ev: TensorNumeric[T])
  extends Transformer[Array[String], LabeledSentence[T]] {
  private val buffer = new LabeledSentence[T]()

  override def apply(prev: Iterator[Array[String]]): Iterator[LabeledSentence[T]] =
    prev.map { tokens =>
      val ids = tokens.map(token => ev.fromType[Int](dictionary.getIndex(token)))
      // Inputs are all ids but the last; targets are all ids but the first.
      val inputs = ids.dropRight(1)
      val targets = ids.drop(1)
      // NOTE(review): presumably copy returns the shared buffer itself, so
      // consumers should not keep a result across iterations - confirm
      // against LabeledSentence.copy.
      buffer.copy(inputs, targets)
    }
}

/**
 * Transform arrays of `T` into LabeledSentence with a fixed number of steps.
 *
 * For each input array the first `numSteps` elements become the data and the
 * `numSteps` elements starting at offset 1 become the label. Any elements
 * beyond index `numSteps` are ignored.
 *
 * e.g. numSteps = 3
 *
 *      input   = [0, 1, 2, 3]
 *      xbuffer = [0, 1, 2]
 *      ybuffer = [1, 2, 3]
 *
 * next input:
 *      input   = [3, 4, 5, 6]
 *      xbuffer = [3, 4, 5]
 *      ybuffer = [4, 5, 6]
 *
 * `xbuffer`, `ybuffer` and `buffer` are allocated once and overwritten for
 * every input, and the very same `buffer` instance is returned for each
 * element, so a result must be consumed before the next one is requested.
 *
 * @param numSteps number of elements in both data and label
 * @param ev numeric operations for `T`
 * @tparam T numeric element type
 */
private[bigdl] class TextToSentenceWithSteps[T: ClassTag](numSteps: Int)
  (implicit ev: TensorNumeric[T])
  extends Transformer[Array[T], LabeledSentence[T]] {
  val xbuffer = new Array[T](numSteps)
  val ybuffer = new Array[T](numSteps)
  val buffer = new LabeledSentence[T]()

  /**
   * Lazily map every input array to the shared LabeledSentence.
   *
   * @param prev iterator of input arrays, each with at least `numSteps + 1` elements
   * @return iterator yielding the shared `buffer` once per input array
   * @throws IllegalArgumentException when an input array is shorter than
   *                                  `numSteps + 1`; raised when that element
   *                                  is pulled from the returned iterator
   */
  override def apply(prev: Iterator[Array[T]]): Iterator[LabeledSentence[T]] = {
    prev.map { sentence =>
      // The condition accepts longer inputs too, so the message says "at least".
      require(sentence.length >= numSteps + 1,
        "input sentence length should be at least numSteps + 1, " +
          s"sentence.length = ${sentence.length}, numSteps = ${numSteps}")
      // Data window starts at 0, label window is the same window shifted by one.
      Array.copy(sentence, 0, xbuffer, 0, numSteps)
      Array.copy(sentence, 1, ybuffer, 0, numSteps)

      buffer.copy(xbuffer, ybuffer)
      buffer
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy