All downloads are free. The search and download features use the official Maven repository.

com.spotify.scio.elasticsearch.ElasticsearchIO.scala Maven / Gradle / Ivy

/*
 * Copyright 2022 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.spotify.scio.elasticsearch

import co.elastic.clients.elasticsearch.core.bulk.BulkOperation
import com.spotify.scio.ScioContext
import com.spotify.scio.io._
import com.spotify.scio.values.SCollection
import org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Write.BulkExecutionException
import org.apache.beam.sdk.io.{elasticsearch => beam}
import org.apache.beam.sdk.transforms.SerializableFunction
import org.apache.http.auth.UsernamePasswordCredentials
import org.joda.time.Duration

import java.lang.{Iterable => JIterable}
import scala.jdk.CollectionConverters._
import scala.util.chaining._

/**
 * Write-only [[ScioIO]] that sends the elements of an SCollection to Elasticsearch.
 *
 * Reading is not supported: `read` always throws an `UnsupportedOperationException`, and both
 * `write` and `tap` return an `EmptyTap`.
 *
 * @param esOptions
 *   connection settings; this class reads its `nodes`, `usernameAndPassword` and `mapperFactory`
 * @tparam T
 *   element type that gets converted into bulk operations on write
 */
final case class ElasticsearchIO[T](esOptions: ElasticsearchOptions) extends ScioIO[T] {
  override type ReadP = Nothing
  override type WriteP = ElasticsearchIO.WriteParam[T]
  override val tapT: TapT.Aux[T, Nothing] = EmptyTapOf[T]

  // The test id is derived from the nodes and the credentials only; the mapperFactory is
  // deliberately left out.
  override def testId: String = {
    val nodes = esOptions.nodes
    val auth = esOptions.usernameAndPassword
    s"ElasticsearchIO($nodes, $auth)"
  }

  /** Always fails: this IO cannot be used as a source. */
  override protected def read(sc: ScioContext, params: ReadP): SCollection[T] =
    throw new UnsupportedOperationException("Can't read from Elasticsearch")

  /**
   * Save this SCollection into Elasticsearch.
   *
   * @param data
   *   the collection whose elements are written
   * @param params
   *   write settings: element-to-bulk-operation function, error handler, flush interval, shard
   *   count, bulk request limits and retry configuration
   * @return
   *   an `EmptyTap`
   */
  override protected def write(data: SCollection[T], params: WriteP): Tap[Nothing] = {
    // A negative shard count means "not set": fall back to one shard per configured node.
    val shardCount = params.numOfShards match {
      case requested if requested >= 0 => requested
      case _                           => esOptions.nodes.size.toLong
    }

    val maybeCredentials = esOptions.usernameAndPassword.map { case (user, secret) =>
      new UsernamePasswordCredentials(user, secret)
    }

    // Adapts the user supplied Scala function to the Java functional type the transform expects.
    val toBulkOperations = new SerializableFunction[T, JIterable[BulkOperation]]() {
      override def apply(element: T): JIterable[BulkOperation] = params.f(element).asJava
    }

    val transform = beam.ElasticsearchIO.Write
      .withNodes(esOptions.nodes.toArray)
      .withFunction(toBulkOperations)
      .withFlushInterval(params.flushInterval)
      .withNumOfShard(shardCount)
      .withMaxBulkRequestOperations(params.maxBulkRequestOperations)
      .withMaxBulkRequestBytes(params.maxBulkRequestBytes)
      .withMaxRetries(params.retry.maxRetries)
      .withRetryPause(params.retry.retryPause)
      .withError((failure: BulkExecutionException) => params.errorFn(failure))
      .pipe { builder =>
        // Credentials are attached only when a username/password pair was configured.
        maybeCredentials match {
          case Some(creds) => builder.withCredentials(creds)
          case None        => builder
        }
      }
      .withMapperFactory(() => esOptions.mapperFactory.apply())

    data.applyInternal(transform)
    EmptyTap
  }

  /** There is nothing to read back, so the tap is always empty. */
  override def tap(params: ReadP): Tap[Nothing] = EmptyTap
}

/** Companion holding the write parameters of [[ElasticsearchIO]] and their default values. */
object ElasticsearchIO {

  /**
   * Retry settings handed to the Elasticsearch write transform.
   *
   * @param maxRetries
   *   value passed to the transform's `withMaxRetries`
   * @param retryPause
   *   value passed to the transform's `withRetryPause`
   */
  final case class RetryConfig(maxRetries: Int, retryPause: Duration)

  /** Default values used by [[WriteParam]]. */
  object WriteParam {

    /** Default error handler: rethrows the bulk execution exception it receives. */
    val DefaultErrorFn: BulkExecutionException => Unit = failure => throw failure

    /** Default flush interval: one second. */
    val DefaultFlushInterval: Duration = Duration.standardSeconds(1)

    /** Negative sentinel; the write then falls back to the number of configured nodes. */
    val DefaultNumShards: Long = -1L

    /** Default maximum number of operations per bulk request. */
    val DefaultMaxBulkRequestOperations: Int = 3000

    /** Default maximum bulk request size in bytes: 5 MiB. */
    val DefaultMaxBulkRequestBytes: Long = 5L << 20

    /** Default maximum number of retries. */
    val DefaultMaxRetries: Int = 3

    /** Default pause between retries: 35 seconds. */
    val DefaultRetryPause: Duration = Duration.standardSeconds(35)

    /** Default retry configuration, built from the two retry defaults above. */
    val DefaultRetryConfig: RetryConfig = RetryConfig(DefaultMaxRetries, DefaultRetryPause)
  }

  /**
   * Parameters for writing to Elasticsearch.
   *
   * @param f
   *   turns one element into the bulk operations to execute for it
   * @param errorFn
   *   invoked with the exception raised by a failed bulk execution
   * @param flushInterval
   *   flush interval passed to the write transform
   * @param numOfShards
   *   shard count; a negative value makes the write use the number of configured nodes
   * @param maxBulkRequestOperations
   *   maximum number of operations per bulk request
   * @param maxBulkRequestBytes
   *   maximum size of a bulk request, in bytes
   * @param retry
   *   retry count and pause
   */
  final case class WriteParam[T] private (
    f: T => Iterable[BulkOperation],
    errorFn: BulkExecutionException => Unit = WriteParam.DefaultErrorFn,
    flushInterval: Duration = WriteParam.DefaultFlushInterval,
    numOfShards: Long = WriteParam.DefaultNumShards,
    maxBulkRequestOperations: Int = WriteParam.DefaultMaxBulkRequestOperations,
    maxBulkRequestBytes: Long = WriteParam.DefaultMaxBulkRequestBytes,
    retry: RetryConfig = WriteParam.DefaultRetryConfig
  )
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy