
// com.intel.analytics.bigdl.nn.TransformerOperation.scala
/*
* Copyright 2016 The BigDL Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.intel.analytics.bigdl.nn
import breeze.linalg.*
import breeze.numerics.exp
import com.intel.analytics.bigdl._
import com.intel.analytics.bigdl.nn.abstractnn.{AbstractModule, TensorModule}
import com.intel.analytics.bigdl.optim.Regularizer
import com.intel.analytics.bigdl.tensor.{Storage, Tensor}
import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric
import com.intel.analytics.bigdl.utils.{EngineType, T}
import scala.reflect.ClassTag
private[nn] object TransformerOperation {

  def dense[T: ClassTag](
    inputSize: Int,
    outputSize: Int,
    bias: Boolean = true,
    activation: TensorModule[T] = null,
    wRegularizer: Regularizer[T] = null,
    bRegularizer: Regularizer[T] = null,
    name: String = "")(implicit ev: TensorNumeric[T]): Module[T] = {
    val seq = new Sequential[T]()
    val layer = Linear[T](
      inputSize = inputSize,
      outputSize = outputSize,
      withBias = bias,
      wRegularizer = wRegularizer,
      bRegularizer = bRegularizer)
    layer.setInitMethod(weightInitMethod = Xavier, biasInitMethod = Zeros)
    if (name != "") layer.setName(name)
    seq.add(TimeDistributed[T](layer))
    if (activation != null) seq.add(activation)
    seq
  }
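
  // Illustrative usage (a sketch; the 8 -> 16 sizes and the layer name are assumptions,
  // not values taken from BigDL callers):
  //   val proj = TransformerOperation.dense[Float](inputSize = 8, outputSize = 16,
  //     activation = ReLU[Float](), name = "q_projection")
  //   // The same Linear(8 -> 16) + ReLU is applied to every time step of a [batch, time, 8] input.
  //   val out = proj.forward(Tensor[Float](2, 5, 8).rand()).toTensor[Float]  // shape 2 x 5 x 16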
  def softMax[T: ClassTag]()(implicit ev: TensorNumeric[T]): Module[T] = {
    val layer = SoftMax[T]()
    val model = Sequential[T]()
    // SoftMax normalises over dimension 2 of a 4-D input, so swap dimensions 2 and 4,
    // apply it, then swap back, which normalises over the last (key length) dimension.
    model.add(Transpose[T](Array((2, 4))))
    model.add(layer)
    model.add(Transpose[T](Array((2, 4))))
    model.asInstanceOf[AbstractModule[Tensor[T], Tensor[T], T]]
  }
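
  // Illustrative usage (a sketch; the [2, 4, 6, 6] shape is an assumption): for attention logits
  // of shape [batch, numHeads, queryLength, keyLength], the returned module produces weights
  // that sum to 1 over the key positions.
  //   val attnWeights = TransformerOperation.softMax[Float]().forward(Tensor[Float](2, 4, 6, 6).rand())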
  /**
   * Calculate a bias tensor from the padding values in a tensor.
   * The bias is added to the pre-softmax multi-headed attention logits, which have shape
   * [batch_size, num_heads, length, length]. The bias is zero at non-padding locations
   * and -1e9 (standing in for negative infinity) at padding locations.
   * @param input int tensor with shape [batch_size, length]
   * @tparam T the numeric type of the tensor
   * @return attention bias tensor of shape [batch_size, 1, 1, length]
   */
  def getPaddingBias[T: ClassTag](input: Tensor[T])(implicit ev: TensorNumeric[T]): Tensor[T] = {
    val res = getPadding[T](input).mul(ev.fromType(-1e9))
    res.addSingletonDimension(res, 2)
    res.addSingletonDimension(res, 3)
  }
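
  // Illustrative usage (a sketch; the 2 x 3 id values are assumptions, with 0 marking padding):
  //   val ids = Tensor[Float](Storage(Array(3f, 5f, 0f, 7f, 0f, 0f)), 1, Array(2, 3))
  //   val bias = TransformerOperation.getPaddingBias(ids)
  //   // bias has shape [2, 1, 1, 3]; padded slots hold -1e9, all other slots hold 0.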
  /**
   * Return a float tensor representing the padding values in x.
   * @param input int tensor with any shape
   * @param paddingValue the value that marks padding positions in the input (0 by default)
   * @return float tensor with the same shape as the input containing values 0 or 1:
   *         0 -> non-padding, 1 -> padding
   */
  def getPadding[T: ClassTag](input: Tensor[T], paddingValue: Float = 0.0f)
    (implicit ev: TensorNumeric[T]): Tensor[T] = {
    input.apply1(e => if (e == paddingValue) ev.one else ev.zero)
  }
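
  // Illustrative usage (a sketch; example values are assumptions):
  //   val ids = Tensor[Float](Storage(Array(3f, 0f, 7f, 0f)), 1, Array(2, 2))
  //   val pad = TransformerOperation.getPadding(ids)  // [[0, 1], [0, 1]]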
  // Shift the second dimension of x right by one.
  def shiftRight3D[T: ClassTag](input: Tensor[T], output: Tensor[T])
    (implicit ev: TensorNumeric[T]): Tensor[T] = {
    output.resizeAs(input).zero()
    val seqLen = input.size(2)
    output.narrow(2, 2, seqLen - 1).copy(input.narrow(2, 1, seqLen - 1))
    output
  }
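
  // Illustrative usage (a sketch; the 1 x 3 x 2 shape is an assumption). For decoder inputs of
  // shape [batch, seqLength, hiddenSize], every sequence moves one step later in time and the
  // first time step becomes all zeros (the usual start slot for teacher forcing):
  //   val in = Tensor[Float](1, 3, 2).rand()
  //   val out = Tensor[Float]()
  //   TransformerOperation.shiftRight3D(in, out)  // out step 1 is zero, out step t equals in step t - 1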
  def initRangeTensor[T: ClassTag](length: Int, rangeBuffer: Tensor[T])
    (implicit ev: TensorNumeric[T]): Unit = {
    rangeBuffer.resize(Array(length))
    val arr = rangeBuffer.storage().array()
    // honour the storage offset so a buffer backed by shared storage is filled correctly
    val offset = rangeBuffer.storageOffset() - 1
    var i = 0
    while (i < length) {
      arr(i + offset) = ev.fromType(i)
      i += 1
    }
  }
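
  // Illustrative usage (a sketch): fills a 1-D buffer with the positions 0, 1, ..., length - 1,
  // which getPositionEncode below multiplies by the inverse timescales.
  //   val range = Tensor[Float]()
  //   TransformerOperation.initRangeTensor(4, range)  // range = [0, 1, 2, 3]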
  /**
   * Compute the sinusoidal position encoding (get_timing_signal_1d).
   * Args:
   *   length: Sequence length.
   *   channels: Size of the hidden dimension; the first channels / 2 columns hold sine
   *             values, the remaining columns hold cosine values.
   *   minTimescale: Minimum scale that will be applied at each position.
   *   maxTimescale: Maximum scale that will be applied at each position.
   * Returns: Tensor with shape [length, hidden_size]
   */
  def getPositionEncode[T: ClassTag](
    length: Int,
    channels: Int,
    minTimescale: Float = 1.0f,
    maxTimescale: Float = 1.0e4f,
    rangeBuffer: Tensor[T],
    outBuffer: Tensor[T])(implicit ev: TensorNumeric[T]): Tensor[T] = {
    // get_timing_signal_1d, return (1, length, channels)
    val numTimescales = channels / 2
    val logTimescale = math.log(maxTimescale / minTimescale) /
      math.max(numTimescales - 1, 1)
    // tf.range(num_timescales)
    val invTensor = Tensor[T](1, numTimescales)
    val invTimescales = invTensor.storage().array()
    val offset = invTensor.storageOffset() - 1
    var i = 0
    while (i < numTimescales) {
      invTimescales(i + offset) = ev.fromType(minTimescale * math.exp(i * -logTimescale))
      i += 1
    }
    // outer product: positions (length x 1) times inverse timescales (1 x numTimescales)
    val outSin = outBuffer.narrow(2, 1, numTimescales)
    outSin.addmm(ev.zero, ev.one, rangeBuffer.resize(length, 1), invTensor)
    val outCos = outBuffer.narrow(2, numTimescales + 1, numTimescales).copy(outSin)
    outSin.apply1(e => ev.fromType(math.sin(ev.toType[Float](e))))
    outCos.apply1(e => ev.fromType(math.cos(ev.toType[Float](e))))
    outBuffer
  }
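
  // Illustrative usage (a sketch; length = 4 and channels = 6 are assumptions). The caller is
  // expected to size the buffers: rangeBuffer holds 0 .. length - 1 and outBuffer is
  // [length, channels] before the call.
  //   val range = Tensor[Float]()
  //   TransformerOperation.initRangeTensor(4, range)
  //   val out = Tensor[Float](4, 6)
  //   TransformerOperation.getPositionEncode(4, 6, rangeBuffer = range, outBuffer = out)
  //   // row i of out is [sin(i * s1), sin(i * s2), sin(i * s3), cos(i * s1), cos(i * s2), cos(i * s3)],
  //   // where s1 .. s3 are the inverse timescales.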
  private val maskValue = -1e9

  /**
   * Create a bias tensor to be added to attention logits.
   * The upper triangle (j > i) is filled with -1e9 so each position may only attend to
   * itself and to earlier positions; the remaining entries are left as provided
   * (callers pass a zero-initialised buffer).
   * Returns a tensor with shape (1, 1, length, length).
   * @param length sequence length
   * @tparam T the numeric type of the tensor
   * @return lower-triangular attention bias of shape (1, 1, length, length)
   */
  def attentionBiasLowerTriangle[T: ClassTag](
    length: Int, output: Tensor[T])(implicit ev: TensorNumeric[T]): Tensor[T] = {
    val arr = output.storage().array()
    for (i <- 0 until length) {
      var j = length - 1
      while (j > i) {
        // only the entries strictly above the diagonal (j > i) receive the mask value
        arr(i * length + j) = ev.fromType(maskValue)
        j -= 1
      }
    }
    output.resize(Array(1, 1, length, length))
  }
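
  // Illustrative usage (a sketch; length = 3 is an assumption). The output buffer is expected to
  // arrive zero-initialised with length * length elements:
  //   val buf = Tensor[Float](3 * 3).zero()
  //   val bias = TransformerOperation.attentionBiasLowerTriangle(3, buf)
  //   // bias reshaped to 3 x 3:  [[0, -1e9, -1e9],
  //   //                           [0,    0, -1e9],
  //   //                           [0,    0,    0]]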
}
sealed trait TransformerType
case object Translation extends TransformerType
case object LanguageModel extends TransformerType