All downloads are free. Search and download functionality uses the official Maven repository.

com.lucidworks.spark.TimePartitioningQuery.scala Maven / Gradle / Ivy

package com.lucidworks.spark

import java.text.SimpleDateFormat
import java.util.{Date, TimeZone}
import java.util.concurrent.TimeUnit
import java.util.regex.Pattern

import org.apache.solr.client.solrj.SolrQuery

import scala.collection.JavaConverters._
import com.lucidworks.spark.util.QueryConstants._
import com.lucidworks.spark.util.{ConfigurationConstants, SolrQuerySupport, SolrSupport}
import TimePartitioningQuery._
import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser
import org.apache.lucene.search.{Query, TermRangeQuery}
import org.apache.lucene.util.BytesRef
import org.apache.solr.util.DateMathParser

import scala.collection.mutable.ListBuffer

/**
 * Computes the subset of time-partitioned Solr collections that a query needs to touch.
 *
 * Collections are assumed to be named `<collection>_<dateSuffix>` where the suffix follows
 * the configured date-time pattern (e.g. `yyyy_MM_dd`). Timestamp range filter queries
 * (e.g. `timestamp_tdt:[NOW-1DAY TO NOW]`) are parsed into Lucene [[TermRangeQuery]]s and
 * mapped onto the sorted partition list so only the matching partitions are queried.
 *
 * @param solrConf   connector configuration (ZK host, collection/alias, partitioning options)
 * @param query      Solr query whose filter queries are inspected for timestamp ranges
 * @param partitions optional pre-computed partition list; when None the partitions are
 *                   discovered from ZooKeeper cluster state
 */
class TimePartitioningQuery(solrConf: SolrConf, query: SolrQuery, partitions: Option[List[String]] = None) extends LazyLogging {

  val dateTimePattern: String = getDateTimePattern

  // SimpleDateFormat is not thread-safe, so each thread gets its own instance,
  // configured with the user-supplied time zone (or the default).
  val dateFormatter: ThreadLocal[SimpleDateFormat] = new ThreadLocal[SimpleDateFormat]() {
    override protected def initialValue: SimpleDateFormat = {
      val sdf: SimpleDateFormat = new SimpleDateFormat(dateTimePattern)
      sdf.setTimeZone(TimeZone.getTimeZone(solrConf.getTimeZoneId.getOrElse(DEFAULT_TIMEZONE_ID)))
      sdf
    }
  }

  /**
   * Returns the partitions (collection names) this query must be executed against.
   * Falls back to all partitions when no timestamp range filter query is present.
   */
  def getPartitionsForQuery(): List[String] = {
    val timestampField = solrConf.getTimestampFieldName.getOrElse(DEFAULT_TIMESTAMP_FIELD_NAME)
    val timestampFilterPrefix = s"$timestampField:"

    // Get all partitions from cluster state (or use the pre-computed list, if supplied)
    val allPartitions: List[String] = partitions.getOrElse(getPartitions(true))

    // BUGFIX: was `== null && ...isEmpty` — with `&&`, a null array short-circuited into
    // `.isEmpty` on null (NPE), and an empty array never triggered the guard. Use `||`.
    if (query.getFilterQueries == null || query.getFilterQueries.isEmpty) {
      logger.warn(s"No filter query found in ${query}")
      return allPartitions
    }
    val rangeQueries = filterRangeQueries(query.getFilterQueries, timestampFilterPrefix)
    // TODO: What to do if there are multiple filter queries
    if (rangeQueries.isEmpty) {
      logger.warn(s"No range queries found for filter queries ${query.getFilterQueries.mkString(",")}. Returning all partitions: ${allPartitions}")
      return allPartitions
    }
    logger.debug(s"All partitions returned for query are: ${allPartitions}")
    getCollectionsForRangeQueries(rangeQueries, allPartitions)
  }

  /**
   * Keeps only non-empty filter queries on the timestamp field, excluding the
   * unbounded `[* TO *]` range (which matches everything and prunes nothing).
   */
  def filterRangeQueries(filterQueries: Array[String], timestampFilterPrefix: String): Array[String] = {
    filterQueries
      .filter(fq => fq != null && fq.nonEmpty)
      .filter(fq => fq.startsWith(timestampFilterPrefix) && fq.substring(timestampFilterPrefix.length) != "[* TO *]")
  }

  /**
   * Returns the partition list, preferring an explicit alias expansion when configured.
   * When `activeOnly` and `maxActivePartitions` are set, only the newest N partitions
   * are retained (the list is sorted ascending, so the oldest are dropped).
   *
   * @throws IllegalArgumentException if the configured maxActivePartitions is < 1
   */
  def getPartitions(activeOnly: Boolean): List[String] = {
    if (solrConf.getCollectionAlias.isDefined) {
      val aliasList: Option[List[String]] = SolrQuerySupport.getCollectionsForAlias(solrConf.getZkHost.get, solrConf.getCollectionAlias.get)
      if (aliasList.isDefined) {
        return aliasList.get
      }
    }
    val partitions: List[String] = findAllPartitions
    if (activeOnly && solrConf.getMaxActivePartitions.isDefined) {
      val maxActivePartitions = solrConf.getMaxActivePartitions.get.toInt
      if (maxActivePartitions < 1) {
        throw new IllegalArgumentException(s"Invalid value ${maxActivePartitions} for ${ConfigurationConstants.MAX_ACTIVE_PARTITIONS}. Set to positive number ")
      }
      val numOfPartitionsToRemove = partitions.size - maxActivePartitions
      if (numOfPartitionsToRemove > 0) {
        return partitions.slice(numOfPartitionsToRemove, partitions.size)
      }
    }
    partitions
  }

  /**
   * Discovers all collections in ZooKeeper cluster state whose names match the
   * partition naming pattern, returned sorted ascending (i.e. chronologically,
   * since the date suffix sorts lexicographically).
   */
  def findAllPartitions: List[String] = {
    val collections: Set[String] = SolrSupport.getCachedCloudClient(solrConf.getZkHost.get)
        .getZkStateReader.getClusterState.getCollectionsMap.keySet().asScala.toSet
    val partitionMatchRegex: Pattern = getPartitionMatchRegex
    collections
      .filter(coll => partitionMatchRegex.matcher(coll).matches())
      .toList
      .sorted
  }

  /**
   * Resolves up to two range filter queries to a partition list; two ranges are
   * intersected (both must match).
   *
   * @throws IllegalArgumentException if more than two range clauses are supplied
   */
  def getCollectionsForRangeQueries(rangeQueries: Array[String], partitions: List[String]): List[String] = {
    if (rangeQueries.length > 2)
      throw new IllegalArgumentException("Please consolidate date range filter criteria to at most 2 clauses!")
    if (rangeQueries.length == 2) {
      val query1Slice = getCollectionsForRangeQuery(rangeQueries(0), partitions)
      val query2Slice = getCollectionsForRangeQuery(rangeQueries(1), partitions)
      return query1Slice.intersect(query2Slice)
    }
    getCollectionsForRangeQuery(rangeQueries(0), partitions)
  }

  /**
   * Parses a single `field:[lower TO upper]` filter into a Lucene range query and
   * returns the contiguous slice of sorted partitions covered by that range.
   * A missing bound extends the slice to the corresponding end of the list.
   *
   * @throws IllegalArgumentException if the filter does not parse to a TermRangeQuery
   */
  def getCollectionsForRangeQuery(rangeQuery: String, partitions: List[String]): List[String] = {
    val sortedPartitions = partitions.sorted
    // The text before ':' is the field name, used as the parser's default field.
    val luceneQuery: Query = (new StandardQueryParser).parse(rangeQuery, rangeQuery.substring(0, rangeQuery.indexOf(":")))
    if (!luceneQuery.isInstanceOf[TermRangeQuery]) throw new IllegalArgumentException("Failed to parse " + rangeQuery + " into a Lucene range query!")
    val rq: TermRangeQuery = luceneQuery.asInstanceOf[TermRangeQuery]
    val lower: String = bref2str(rq.getLowerTerm)
    val upper: String = bref2str(rq.getUpperTerm)

    if (lower == null && upper == null) {
      // Both bounds open: every partition matches.
      return sortedPartitions
    }
    val fromIndex: Int = if (lower != null) mapToExistingCollIndex(lower, sortedPartitions) else 0
    val toIndex: Int = if (upper != null) mapToExistingCollIndex(upper, sortedPartitions) else sortedPartitions.size - 1
    logger.debug(s"Partitions fromIndex: ${fromIndex}. toIndex: ${toIndex}")
    sortedPartitions.slice(fromIndex, toIndex + 1)
  }

  /**
   * Maps a date-math criterion (e.g. `NOW-1DAY`) to the index of the partition that
   * would contain it: the exact match, else the last partition sorting below it;
   * -1 when the criterion sorts before all partitions.
   */
  def mapToExistingCollIndex(crit: String, partitions: List[String]): Int = {
    val collDate = DateMathParser.parseMath(null, crit.toUpperCase)
    val coll: String = getCollectionNameForDate(collDate)

    val lastIndex = partitions.size - 1
    if (coll.compareTo(partitions(lastIndex)) > 0) {
      // Criterion is newer than every partition: clamp to the newest.
      lastIndex
    } else {
      // First index that matches exactly, or whose successor sorts above the criterion.
      // (Replaces a nonlocal `return` inside a foreach lambda, which was implemented
      // by throwing NonLocalReturnControl.)
      partitions.indices
        .find(i => coll == partitions(i) || (i < lastIndex && coll.compareTo(partitions(i + 1)) < 0))
        .getOrElse(-1)
    }
  }

  /**
   * Builds a regex matching partition collection names: the base collection name,
   * an underscore, then the date-time pattern with year/month/day converted to digit
   * classes and optional hour/minute suffixes.
   */
  def getPartitionMatchRegex: Pattern = {
    var dtRegex: String = dateTimePattern
    dtRegex = dtRegex.replace("yyyy", "(\\d{4})")
    dtRegex = dtRegex.replace("yy", "(\\d{2})")
    dtRegex = dtRegex.replace("MM", "(1[0-2]|0[1-9])")
    dtRegex = dtRegex.replace("dd", "(3[0-1]|[0-2][0-9])")
    // Hours (and minutes) are matched as optional groups so that coarser-grained
    // collection names still match a finer-grained pattern.
    var underscore: String = ""
    var hoursAt: Int = dtRegex.indexOf("_HH")
    if (hoursAt != -1) {
      dtRegex = dtRegex.substring(0, hoursAt)
      underscore = "_"
    }
    else {
      hoursAt = dtRegex.indexOf("HH")
      if (hoursAt != -1) {
        dtRegex = dtRegex.substring(0, hoursAt)
      }
    }
    dtRegex += "(" + underscore + "(2[0-3]|[0-1][0-9]))?(" + underscore + "([0-5][0-9]))?"
    Pattern.compile(solrConf.getCollection.get + "_" + dtRegex)
  }

  /** The configured date-time pattern, or a default derived from the time period unit. */
  def getDateTimePattern: String = {
    solrConf.getDateTimePattern.getOrElse(getDefaultDateTimePattern)
  }

  /** Default pattern granularity follows the partitioning time unit (day/hour/minute). */
  def getDefaultDateTimePattern: String = {
    val timeUnit = getTimeUnit
    if (timeUnit eq TimeUnit.HOURS) "yyyy_MM_dd_HH"
    else if (timeUnit eq TimeUnit.MINUTES) "yyyy_MM_dd_HH_mm"
    else DEFAULT_DATETIME_PATTERN
  }

  /**
   * Parses the configured time period (e.g. "1DAYS") into its TimeUnit component.
   *
   * @throws IllegalArgumentException if the period does not match TIMEPERIOD_PATTERN
   */
  def getTimeUnit: TimeUnit = {
    val timePeriod = solrConf.getTimePeriod.getOrElse(DEFAULT_TIME_PERIOD)
    val matcher = TIMEPERIOD_PATTERN.matcher(timePeriod)
    if (!matcher.matches) {
      throw new IllegalArgumentException("Invalid timePeriod "+ timePeriod)
    }
    TimeUnit.valueOf(matcher.group(2))
  }

  /** Formats a date into the partition collection name it belongs to. */
  def getCollectionNameForDate(date: Date): String = {
    val formattedDate = dateFormatter.get().format(date)
    solrConf.getCollectionAlias.getOrElse(solrConf.getCollection.get) + "_" + formattedDate
  }
}

object TimePartitioningQuery {
  /** Matches time-period specs such as "1DAYS" or "30MINUTES": a 1-4 digit count plus a unit. */
  val TIMEPERIOD_PATTERN: Pattern = Pattern.compile("^(\\d{1,4})(MINUTES|HOURS|DAYS|MONTH)$")

  /** Decodes a BytesRef to its UTF-8 string form; a null input passes through as null. */
  def bref2str(bytesRef: BytesRef): String =
    Option(bytesRef).map(_.utf8ToString).orNull
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy