All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.lucidworks.spark.example.query.QueryBenchmark.scala Maven / Gradle / Ivy

package com.lucidworks.spark.example.query

import com.lucidworks.spark.SparkApp
import com.lucidworks.spark.rdd.SelectSolrRDD
import com.lucidworks.spark.util.SolrSupport
import org.apache.commons.cli.{CommandLine, Option}
import org.apache.solr.client.solrj.SolrQuery
import org.apache.solr.client.solrj.request.CollectionAdminRequest
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Benchmarks reading a Solr collection into Spark via [[SelectSolrRDD]]:
 * first a `count()` using intra-shard splits on a numeric field, then —
 * after reloading the collection to flush Solr caches — a plain per-shard
 * `count()`, printing the elapsed time of each run.
 */
class QueryBenchmark extends SparkApp.RDDProcessor {
  def getName: String = "query-solr-benchmark"

  /** Command-line options accepted by this benchmark (all optional, with defaults applied in [[run]]). */
  def getOptions: Array[Option] = {
    Array(
      Option.builder().longOpt("query").hasArg.required(false).desc("URL encoded Solr query to send to Solr, default is *:* (all docs)").build,
      Option.builder().longOpt("rows").hasArg.required(false).desc("Number of rows to fetch at once, default is 1000").build,
      Option.builder().longOpt("splitsPerShard").hasArg.required(false).desc("Number of splits per shard, default is 3").build,
      Option.builder().longOpt("splitField").hasArg.required(false).desc("Name of an indexed numeric field (preferably long type) used to split a shard, default is _version_").build,
      Option.builder().longOpt("fields").hasArg.required(false).desc("Comma-delimited list of fields to be returned from the query; default is all fields").build
    )
  }

  /**
   * Runs the two timed benchmarks.
   *
   * @param conf Spark configuration used to create the SparkContext
   * @param cli  parsed command line (see [[getOptions]]; zkHost/collection
   *             presumably come from options declared by SparkApp — defaults
   *             here are localhost:9983 / collection1)
   * @return 0 on success (process exit code)
   */
  def run(conf: SparkConf, cli: CommandLine): Int = {

    val zkHost = cli.getOptionValue("zkHost", "localhost:9983")
    val collection = cli.getOptionValue("collection", "collection1")
    val queryStr = cli.getOptionValue("query", "*:*")
    val rows = cli.getOptionValue("rows", "1000").toInt
    val splitsPerShard = cli.getOptionValue("splitsPerShard", "3").toInt
    val splitField = cli.getOptionValue("splitField", "_version_")

    val sc = new SparkContext(conf)
    // Ensure the SparkContext is always stopped, even if a query or the
    // collection reload throws — the original code leaked it on failure.
    try {
      val solrQuery: SolrQuery = new SolrQuery(queryStr)

      val fields = cli.getOptionValue("fields", "")
      if (fields.nonEmpty)
        fields.split(",").foreach(solrQuery.addField)

      // Deterministic sort so deep paging / cursor behavior is stable.
      solrQuery.addSort(new SolrQuery.SortClause("id", "asc"))
      solrQuery.setRows(rows)

      val solrRDD: SelectSolrRDD = new SelectSolrRDD(zkHost, collection, sc)

      // Times a count action and prints the result; by-name parameter keeps
      // the timing/reporting logic in one place instead of duplicated vars.
      def timeCount(label: String)(countOp: => Long): Unit = {
        val startMs: Long = System.currentTimeMillis
        val count = countOp
        val tookMs: Long = System.currentTimeMillis - startMs
        println(s"\nTook $tookMs ms read $count docs using $label")
      }

      timeCount(s"queryShards with $splitsPerShard splits") {
        solrRDD.query(solrQuery).splitField(splitField).splitsPerShard(splitsPerShard).count()
      }

      // IMPORTANT: reload the collection to flush caches so the second run
      // is not unfairly faster than the first.
      println(s"\nReloading collection $collection to flush caches!\n")
      val cloudSolrClient = SolrSupport.getCachedCloudClient(zkHost)
      val req = CollectionAdminRequest.reloadCollection(collection)
      cloudSolrClient.request(req)

      timeCount("queryShards") {
        solrRDD.query(solrQuery).count()
      }

      0
    } finally {
      sc.stop()
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy