// com.spotify.scio.elasticsearch.package.scala Maven / Gradle / Ivy
/*
* Copyright 2022 Spotify AB.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.spotify.scio
import co.elastic.clients.elasticsearch.core.bulk.BulkOperation
import co.elastic.clients.json.jackson.JacksonJsonpMapper
import co.elastic.clients.json.JsonpMapper
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule
import com.fasterxml.jackson.module.scala.DefaultScalaModule
import com.spotify.scio.elasticsearch.ElasticsearchIO.{RetryConfig, WriteParam}
import com.spotify.scio.io.ClosedTap
import com.spotify.scio.values.SCollection
import org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO.Write.BulkExecutionException
import org.apache.http.HttpHost
import org.joda.time.Duration
/**
* Main package for Elasticsearch APIs. Import all.
*
* {{{
* import com.spotify.scio.elasticsearch._
* }}}
*/
package object elasticsearch extends CoderInstances {

  /**
   * Create a new [[JsonpMapper]] for user JSON serialization.
   *
   * The mapper is Jackson-based; the Scala module and the `java.time` module are registered on its
   * underlying object mapper. Every call builds a fresh instance.
   */
  def defaultMapper(): JsonpMapper = {
    val jacksonMapper = new JacksonJsonpMapper()
    // Modules are registered on the mapper's own object mapper, so the returned
    // mapper picks up scala and java.time support.
    val objectMapper = jacksonMapper.objectMapper()
    objectMapper.registerModule(DefaultScalaModule)
    objectMapper.registerModule(new JavaTimeModule())
    jacksonMapper
  }

  /**
   * Options describing how to reach Elasticsearch and how to map user JSON.
   *
   * @param nodes
   *   HTTP hosts of the Elasticsearch nodes
   * @param usernameAndPassword
   *   optional `(username, password)` pair; `None` by default
   * @param mapperFactory
   *   factory producing the [[JsonpMapper]] to use; defaults to [[defaultMapper]]
   */
  final case class ElasticsearchOptions(
    nodes: Seq[HttpHost],
    usernameAndPassword: Option[(String, String)] = None,
    mapperFactory: () => JsonpMapper = () => defaultMapper()
  )

  /** Adds Elasticsearch write methods to an [[SCollection]]. */
  implicit class ElasticsearchSCollection[T](@transient private val self: SCollection[T])
      extends AnyVal {

    /**
     * Save this SCollection into Elasticsearch.
     *
     * Every argument except `esOptions` is bundled into a `WriteParam`, which is then handed to
     * the `ElasticsearchIO` built from `esOptions`.
     *
     * @param esOptions
     *   Elasticsearch options
     * @param flushInterval
     *   delay applied to Elasticsearch writes, for rate limiting purposes
     * @param numOfShards
     *   number of parallel writes to be performed; the recommended setting is the number of
     *   pipeline workers
     * @param maxBulkRequestOperations
     *   forwarded unchanged to `WriteParam`; going by its name, the cap on the number of
     *   operations in one bulk request
     * @param maxBulkRequestBytes
     *   forwarded unchanged to `WriteParam`; going by its name, the cap on the size in bytes of
     *   one bulk request
     * @param errorFn
     *   function to handle errors raised when performing Elasticsearch bulk writes
     * @param retry
     *   retry configuration, forwarded unchanged to `WriteParam`
     * @param f
     *   function to transform arbitrary type T to Elasticsearch `BulkOperation`s
     */
    def saveAsElasticsearch(
      esOptions: ElasticsearchOptions,
      flushInterval: Duration = WriteParam.DefaultFlushInterval,
      numOfShards: Long = WriteParam.DefaultNumShards,
      maxBulkRequestOperations: Int = WriteParam.DefaultMaxBulkRequestOperations,
      maxBulkRequestBytes: Long = WriteParam.DefaultMaxBulkRequestBytes,
      errorFn: BulkExecutionException => Unit = WriteParam.DefaultErrorFn,
      retry: RetryConfig = WriteParam.DefaultRetryConfig
    )(f: T => Iterable[BulkOperation]): ClosedTap[Nothing] = {
      // Positional order must match WriteParam's constructor.
      val writeParam = WriteParam(
        f,
        errorFn,
        flushInterval,
        numOfShards,
        maxBulkRequestOperations,
        maxBulkRequestBytes,
        retry
      )
      val io = ElasticsearchIO[T](esOptions)
      self.write(io)(writeParam)
    }
  }
}