// MIT License

// Copyright (c) 2018 Daniel Mateus Pires

// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:

// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.

// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
package tasks.fileservice.s3

import cats.effect._
import fs2.{Chunk, Pipe, Pull}
import software.amazon.awssdk.core.async.{
  AsyncRequestBody,
  AsyncResponseTransformer
}
import software.amazon.awssdk.services.s3.model._

import java.nio.ByteBuffer
import scala.collection.immutable.ArraySeq
import scala.collection.immutable.ArraySeq.unsafeWrapArray
import software.amazon.awssdk.services.s3.S3AsyncClient
import java.util.concurrent.CompletableFuture
import scala.jdk.CollectionConverters._
import software.amazon.awssdk.auth.credentials.DefaultCredentialsProvider
import software.amazon.awssdk.regions.providers.DefaultAwsRegionProviderChain

case class S3UploadResponse(etag: String, contentLength: Long)

object S3 {
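  /** Builds an `S3AsyncClient` using the AWS default credentials provider and
    * the default region provider chain, optionally resolving the region from
    * the given profile name.
    */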
  def makeAWSSDKClient(regionProfileName: Option[String]) = S3AsyncClient
    .builder()
    .credentialsProvider(DefaultCredentialsProvider.create())
    .region({
      val b = DefaultAwsRegionProviderChain.builder()
      regionProfileName
        .foldLeft(b)((a, b) => a.profileName(b))
        .build
        .getRegion()
    })
    .build()
}
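
/* Usage sketch (not part of the original source): building the wrapper. It
 * assumes AWS credentials and a region are resolvable through the default
 * provider chains; the profile name is hypothetical.
 *
 *   val client: S3 = new S3(S3.makeAWSSDKClient(regionProfileName = None))
 *   // or, resolving the region from a named profile:
 *   val fromProfile: S3 = new S3(S3.makeAWSSDKClient(Some("my-profile")))
 */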

/** Wraps the AWS SDK's S3AsyncClient in fs2.Stream and cats.effect.IO based
  * operations.
  *
  * @param s3
  *   the underlying AWS SDK asynchronous S3 client
  */
class S3(val s3: S3AsyncClient) {
  private type PartId = Long
  private type PartLength = Long
  private type UploadId = String

  /** Lifts an AWS SDK `CompletableFuture` into `IO`. */
  def io[A](fut: => CompletableFuture[A]): IO[A] =
    IO.fromCompletableFuture(IO.delay(fut))

  /** Deletes a file in a single request.
    */
  def delete(bucket: String, key: String): IO[Unit] =
    io(
      s3.deleteObject(
        DeleteObjectRequest
          .builder()
          .bucket(bucket)
          .key(key)
          .build()
      )
    ).void
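
  /* Usage sketch (hypothetical bucket and key, assuming `client: S3`):
   *
   *   val remove: IO[Unit] = client.delete("my-bucket", "old/report.csv")
   */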

  /** Uploads a file in a single request. Suitable for small files.
    *
    * For big files, consider using [[uploadFileMultipart]] instead.
    */
  def uploadFile(
      bucket: String,
      key: String,
      cannedAcl: List[String],
      serverSideEncryption: Option[String],
      grantFullControl: List[String]
  ): Pipe[IO, Byte, PutObjectResponse] =
    in =>
      fs2.Stream.eval {
        in.compile.toVector.flatMap { vs =>
          val bs = ByteBuffer.wrap(vs.toArray)
          val base = PutObjectRequest
            .builder()
            .bucket(bucket)
            .key(key)

          val builder = grantFullControl.foldLeft(
            serverSideEncryption.foldLeft(
              cannedAcl.foldLeft(base)((a, b) => a.acl(b))
            )((a, b) => a.serverSideEncryption(b))
          )((a, b) => a.grantFullControl(b))

          io(
            s3.putObject(
              builder
                .build(),
              AsyncRequestBody.fromByteBuffer(bs)
            )
          )
        }
      }
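
  /* Usage sketch (hypothetical bucket and key, assuming `client: S3`):
   * streaming a small in-memory payload through the single-request upload
   * pipe, with no canned ACL, encryption or grants applied.
   *
   *   val uploaded: IO[PutObjectResponse] =
   *     fs2.Stream
   *       .chunk(Chunk.array("hello world".getBytes("UTF-8")))
   *       .covary[IO]
   *       .through(client.uploadFile("my-bucket", "hello.txt", Nil, None, Nil))
   *       .compile
   *       .lastOrError
   */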

  /** Uploads a file in multiple parts of `partSize` MB per request. Suitable
    * for big files.
    *
    * It does so in constant memory: at any given time, only about `partSize`
    * MB of data are held in memory.
    *
    * Note: the AWS S3 API does not support uploading empty files via multipart
    * upload; attempting to do so fails with a `400` response and a generic
    * error message. If no data passes through the stream, this implementation
    * aborts the multipart upload and writes an empty object with a single
    * `PutObject` request instead, emitting an [[S3UploadResponse]] with the
    * resulting ETag and a content length of 0.
    *
    * For small files, consider using [[uploadFile]] instead.
    *
    * @param bucket
    *   the bucket name
    * @param key
    *   the target file key
    * @param partSize
    *   the part size in MB; values below 5 (the AWS minimum) are raised to 5
    * @param multiPartConcurrency
    *   the number of parts uploaded concurrently
    * @param cannedAcl
    *   canned ACLs to apply to the created object
    * @param serverSideEncryption
    *   the server-side encryption algorithm to use, if any
    * @param grantFullControl
    *   grantees to be given full control of the created object
    */
  def uploadFileMultipart(
      bucket: String,
      key: String,
      partSize: Int,
      multiPartConcurrency: Int,
      cannedAcl: List[String],
      serverSideEncryption: Option[String],
      grantFullControl: List[String]
  ): Pipe[IO, Byte, S3UploadResponse] = {
    val chunkSizeBytes = math.max(5, partSize) * 1048576

    def initiateMultipartUpload = {
      val base = CreateMultipartUploadRequest
        .builder()
        .bucket(bucket)
        .key(key)
      val builder = grantFullControl.foldLeft(
        serverSideEncryption.foldLeft(
          cannedAcl.foldLeft(base)((a, b) => a.acl(b))
        )((a, b) => a.serverSideEncryption(b))
      )((a, b) => a.grantFullControl(b))

      io(
        s3.createMultipartUpload(
          builder
            .build()
        )
      ).map(_.uploadId())
    }

    def uploadPart(
        uploadId: UploadId
    ): Pipe[
      IO,
      (Chunk[Byte], PartId),
      (UploadPartResponse, PartId, PartLength)
    ] =
      _.parEvalMap(multiPartConcurrency) { case (c, i) =>
        io(
          s3.uploadPart(
            UploadPartRequest
              .builder()
              .bucket(bucket)
              .key(key)
              .uploadId(uploadId)
              .partNumber(i.toInt)
              .contentLength(c.size.toLong)
              .build(),
            AsyncRequestBody.fromBytes(c.toArray)
          )
        ).map(r => (r, i, c.size.toLong))
      }

    def uploadEmptyFile = io(
      s3.putObject(
        PutObjectRequest.builder().bucket(bucket).key(key).build(),
        AsyncRequestBody.fromBytes(new Array[Byte](0))
      )
    )
    def completeUpload(
        uploadId: UploadId
    ): Pipe[IO, List[
      (UploadPartResponse, PartId, PartLength)
    ], S3UploadResponse] =
      _.evalMap {
        case Nil =>
          cancelUpload(uploadId).flatMap { _ =>
            uploadEmptyFile.map(r =>
              S3UploadResponse(etag = r.eTag(), contentLength = 0L)
            )

          }
        case tags =>
          val parts = tags.map { case (t, i, _) =>
            CompletedPart.builder().partNumber(i.toInt).eTag(t.eTag()).build()
          }.asJava
          io(
            s3.completeMultipartUpload(
              CompleteMultipartUploadRequest
                .builder()
                .bucket(bucket)
                .key(key)
                .uploadId(uploadId)
                .multipartUpload(
                  CompletedMultipartUpload.builder().parts(parts).build()
                )
                .build()
            )
          ).map { response =>
            S3UploadResponse(response.eTag(), tags.map(_._3).sum)
          }

      }

    def cancelUpload(uploadId: UploadId): IO[Unit] =
      io(
        s3.abortMultipartUpload(
          AbortMultipartUploadRequest
            .builder()
            .bucket(bucket)
            .key(key)
            .uploadId(uploadId)
            .build()
        )
      ).void

    in =>
      fs2.Stream
        .eval(initiateMultipartUpload)
        .flatMap { uploadId =>
          in.chunkN(chunkSizeBytes)
            .zip(fs2.Stream.iterate(1L)(_ + 1))
            .through(uploadPart(uploadId))
            .fold[List[(UploadPartResponse, PartId, PartLength)]](List.empty)(
              _ :+ _
            )
            .through(completeUpload(uploadId))
            .handleErrorWith(ex =>
              fs2.Stream.eval(cancelUpload(uploadId) >> Sync[IO].raiseError(ex))
            )
        }

  }
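
  /* Usage sketch (hypothetical bucket, key and local path, assuming
   * `client: S3` and an fs2-io dependency for reading the local file):
   * uploading a large file in 5 MB parts with 4 concurrent part uploads.
   *
   *   import fs2.io.file.{Files, Path}
   *
   *   val upload: IO[S3UploadResponse] =
   *     Files[IO]
   *       .readAll(Path("/tmp/big-file.bin"))
   *       .through(
   *         client.uploadFileMultipart(
   *           bucket = "my-bucket",
   *           key = "backups/big-file.bin",
   *           partSize = 5,
   *           multiPartConcurrency = 4,
   *           cannedAcl = Nil,
   *           serverSideEncryption = None,
   *           grantFullControl = Nil
   *         )
   *       )
   *       .compile
   *       .lastOrError
   */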

  /** Returns the object's metadata via a `HeadObject` request, or `None` if
    * the key does not exist.
    */
  def getObjectMetadata(
      bucket: String,
      key: String
  ): IO[Option[HeadObjectResponse]] =
    io(
      s3.headObject(
        HeadObjectRequest
          .builder()
          .bucket(bucket)
          .key(key)
          .build()
      )
    ).map(Option(_))
      .handleErrorWith(_ match {
        case _: NoSuchKeyException => IO.pure(Option.empty[HeadObjectResponse])
        case e: Throwable          => IO.raiseError(e)
      })

  /** Reads a file in a single request. Suitable for small files.
    *
    * For big files, consider using [[readFileMultipart]] instead.
    */
  def readFile(bucket: String, key: String): fs2.Stream[IO, Byte] =
    fs2.Stream
      .eval(
        io(
          s3.getObject(
            GetObjectRequest
              .builder()
              .bucket(bucket)
              .key(key)
              .build(),
            AsyncResponseTransformer.toBytes[GetObjectResponse]
          )
        )
      )
      .flatMap(r =>
        fs2.Stream.chunk(Chunk(ArraySeq.unsafeWrapArray(r.asByteArray): _*))
      )
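
  /* Usage sketch (hypothetical bucket and key, assuming `client: S3`): reading
   * a small object fully into memory.
   *
   *   val contents: IO[Array[Byte]] =
   *     client.readFile("my-bucket", "hello.txt").compile.to(Array)
   */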

  /** Reads a file in multiple parts of `partSize` MB per request. Suitable for
    * big files.
    *
    * It does so in constant memory: at any given time, only about `partSize`
    * MB of data are held in memory.
    *
    * For small files, consider using [[readFile]] instead.
    *
    * @param partSize
    *   the part size in MB
    */
  def readFileMultipart(
      bucket: String,
      key: String,
      partSize: Int
  ): fs2.Stream[IO, Byte] = {
    val chunkSizeBytes = partSize * 1048576

    // Range must be in the form "bytes=0-500" -> https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35
    def go(offset: Long): Pull[IO, Byte, Unit] =
      fs2.Stream
        .eval {
          io(
            s3.getObject(
              GetObjectRequest
                .builder()
                .range(s"bytes=$offset-${offset + chunkSizeBytes}")
                .bucket(bucket)
                .key(key)
                .build(),
              AsyncResponseTransformer.toBytes[GetObjectResponse]
            )
          )
        }
        .pull
        .last
        .flatMap {
          case Some(resp) =>
            Pull.eval {
              IO.interruptible {
                val bs = resp.asByteArray()
                val len = bs.length
                if (len < 0) None else Some(Chunk(unsafeWrapArray(bs): _*))
              }
            }
          case None =>
            Pull.eval(IO.pure(None))
        }
        .flatMap {
          case Some(o) =>
            if (o.size < chunkSizeBytes) Pull.output(o)
            else Pull.output(o) >> go(offset + o.size)
          case None => Pull.done
        }

    go(0).stream
  }
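
  /* Usage sketch (hypothetical bucket, key and local path, assuming
   * `client: S3` and an fs2-io dependency for writing the local file):
   * streaming a large object to disk in 5 MB ranged requests.
   *
   *   import fs2.io.file.{Files, Path}
   *
   *   val download: IO[Unit] =
   *     client
   *       .readFileMultipart("my-bucket", "backups/big-file.bin", partSize = 5)
   *       .through(Files[IO].writeAll(Path("/tmp/big-file.bin")))
   *       .compile
   *       .drain
   */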

}



