// com.stratio.crossdata.connector.mongodb.MongoQueryProcessor.scala
// Artifact: crossdata-mongodb (Crossdata MongoDB connector)
/**
* Copyright (C) 2015 Stratio (http://stratio.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.stratio.crossdata.connector.mongodb
import java.util.regex.Pattern
import com.mongodb.casbah.Imports._
import com.mongodb.DBObject
import com.mongodb.QueryBuilder
import com.stratio.datasource.Config
import com.stratio.datasource.mongodb.MongodbConfig
import com.stratio.datasource.mongodb.schema.MongodbRowConverter._
import com.stratio.datasource.mongodb.MongodbRelation._
import org.apache.spark.Logging
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.planning.PhysicalOperation
import org.apache.spark.sql.catalyst.plans.logical.{Limit => LogicalLimit, LogicalPlan}
import org.apache.spark.sql.sources.CatalystToCrossdataAdapter.{BaseLogicalPlan, FilterReport, SimpleLogicalPlan}
import org.apache.spark.sql.sources.CatalystToCrossdataAdapter
import org.apache.spark.sql.sources.{Filter => SourceFilter}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.Row
import org.apache.spark.sql.sources
/**
 * Helpers that translate Spark data-source filters and the required columns into the
 * pair of MongoDB objects (query, projection) passed to a native `find`.
 */
object MongoQueryProcessor {

  /** Row cap used by the processor when the logical plan carries no explicit limit. */
  val DefaultLimit = 10000
  type ColumnName = String
  type Limit = Option[Int]

  /** A connector base plan together with the optional limit found in the logical plan. */
  case class MongoPlan(basePlan: BaseLogicalPlan, limit: Limit){
    def projects: Seq[NamedExpression] = basePlan.projects
    def filters: Array[SourceFilter] = basePlan.filters
  }

  /** Factory for [[MongoQueryProcessor]]; simply forwards its arguments to the constructor. */
  def apply(logicalPlan: LogicalPlan, config: Config, schemaProvided: Option[StructType] = None): MongoQueryProcessor =
    new MongoQueryProcessor(logicalPlan, config, schemaProvided)

  /**
   * Builds the native MongoDB query.
   *
   * @param requiredColums Names of the columns to project
   * @param filters Source filters to translate
   * @param config Connector configuration (read to decide how `_id` values are encoded)
   * @return The pair (query object, projection object)
   */
  def buildNativeQuery(
    requiredColums: Seq[ColumnName],
    filters: Array[SourceFilter],
    config: Config
  ): (DBObject, DBObject) = {
    (filtersToDBObject(filters)(config), selectFields(requiredColums))
  }

  /**
   * Translates source filters into a MongoDB query object.
   *
   * Every filter becomes its own clause; an empty array yields an empty query, a single
   * filter yields its clause as is, and several filters are combined under `$and`.
   * Keeping clauses separate prevents two conditions on the same attribute (or two
   * top-level `Or` filters) from being merged into, or overwriting, each other.
   *
   * @param sFilters Filters to translate
   * @param parentFilterIsNot When true, every filter in `sFilters` is translated negated
   * @param config Connector configuration (read to decide how `_id` values are encoded)
   * @return The MongoDB query object
   * @throws scala.MatchError if a filter type is not handled here
   */
  def filtersToDBObject(sFilters: Array[SourceFilter], parentFilterIsNot: Boolean = false)(implicit config: Config): DBObject = {
    val clauses: Array[DBObject] = sFilters.map(filterToDBObject(_, parentFilterIsNot))
    clauses.length match {
      case 0 => QueryBuilder.start.get
      case 1 => clauses.head
      case _ => QueryBuilder.start.and(clauses: _*).get
    }
  }

  /**
   * Translates one filter into a stand-alone MongoDB clause.
   *
   * NOTE(review): negated comparisons are emitted with `$not`/`$ne`, which presumably also
   * match documents where the field is missing or null; confirm this is acceptable for callers.
   *
   * @param filter Filter to translate
   * @param negated Whether the clause must express the negation of `filter`
   * @return The clause as a MongoDB object
   */
  private def filterToDBObject(filter: SourceFilter, negated: Boolean)(implicit config: Config): DBObject = {

    // Starts a condition on `attribute`; when negated, the next operator is wrapped in `$not`.
    def on(attribute: String): QueryBuilder = {
      val builder = QueryBuilder.start.put(attribute)
      if (negated) builder.not() else builder
    }

    // The literal is quoted so that regex metacharacters in the value match themselves.
    def regexClause(attribute: String, prefix: String, literal: String, suffix: String): DBObject =
      on(attribute).regex(Pattern.compile(prefix + Pattern.quote(literal) + suffix)).get

    filter match {
      case sources.EqualTo(attribute, value) =>
        val expected = correctIdValue(attribute, value)
        // `$not` cannot wrap a bare value, so a negated equality is expressed with `$ne`.
        if (negated) QueryBuilder.start.put(attribute).notEquals(expected).get
        else QueryBuilder.start.put(attribute).is(expected).get
      case sources.GreaterThan(attribute, value) =>
        on(attribute).greaterThan(correctIdValue(attribute, value)).get
      case sources.GreaterThanOrEqual(attribute, value) =>
        on(attribute).greaterThanEquals(correctIdValue(attribute, value)).get
      case sources.In(attribute, values) =>
        on(attribute).in(values.map(value => correctIdValue(attribute, value))).get
      case sources.LessThan(attribute, value) =>
        on(attribute).lessThan(correctIdValue(attribute, value)).get
      case sources.LessThanOrEqual(attribute, value) =>
        on(attribute).lessThanEquals(correctIdValue(attribute, value)).get
      case sources.IsNull(attribute) =>
        if (negated) QueryBuilder.start.put(attribute).notEquals(null).get
        else QueryBuilder.start.put(attribute).is(null).get
      case sources.IsNotNull(attribute) =>
        if (negated) QueryBuilder.start.put(attribute).is(null).get
        else QueryBuilder.start.put(attribute).notEquals(null).get
      case sources.And(leftFilter, rightFilter) =>
        val operands: Array[DBObject] = Array(leftFilter, rightFilter).map(filterToDBObject(_, negated))
        // De Morgan: NOT (a AND b) == (NOT a) OR (NOT b)
        if (negated) QueryBuilder.start.or(operands: _*).get
        else QueryBuilder.start.and(operands: _*).get
      case sources.Or(leftFilter, rightFilter) =>
        val operands: Array[DBObject] = Array(leftFilter, rightFilter).map(filterToDBObject(_, negated))
        // De Morgan: NOT (a OR b) == (NOT a) AND (NOT b)
        if (negated) QueryBuilder.start.and(operands: _*).get
        else QueryBuilder.start.or(operands: _*).get
      case sources.StringStartsWith(attribute, value) =>
        regexClause(attribute, "^", value, ".*$")
      case sources.StringEndsWith(attribute, value) =>
        regexClause(attribute, "^.*", value, "$")
      case sources.StringContains(attribute, value) =>
        regexClause(attribute, ".*", value, ".*")
      case sources.Not(child) =>
        // The negated clause is returned to the caller; a double negation cancels out.
        filterToDBObject(child, !negated)
    }
  }

  /**
   * Check if the field is "_id" and if the user wants to filter by this field as an ObjectId
   *
   * @param attribute Name of the field
   * @param value Value for the attribute
   * @return The value in the correct data type
   */
  private def correctIdValue(attribute: String, value: Any)(implicit config: Config) : Any = {
    val idAsObjectId: Boolean = config.getOrElse[String](MongodbConfig.IdAsObjectId, MongodbConfig.DefaultIdAsObjectId).equalsIgnoreCase("true")
    attribute match {
      case "_id" if idAsObjectId => new ObjectId(value.toString)
      case _ => value
    }
  }

  /**
   *
   * Prepared DBObject used to specify required fields in mongodb 'find'
   * @param fields Required fields
   * @return A mongodb object that represents required fields.
   */
  private def selectFields(fields: Seq[ColumnName]): DBObject =
  {
    // "_id" is always listed last: 1 when it was requested, 0 to exclude it explicitly.
    val idFlag = if (fields.contains("_id")) 1 else 0
    MongoDBObject(
      fields.toList.filterNot(_ == "_id").map(_ -> 1) ::: List("_id" -> idFlag)
    )
    /*
      For random accesses to array columns elements, a performance improvement is doable
      by querying MongoDB in a way that would only select a size-1 slice of the accessed array thanks to
      the "$slice" operator. However this operator can only be used once for each column in a projection
      which implies that several accesses (e.g: SELECT arraystring[0] as first, arraystring[3] as fourth FROM MONGO_T)
      would require to implement an smart "$slice" use selecting the minimum slice containing all requested elements.
      That requires way too much effort when the performance boost is taken into consideration.
    */
  }
}
// TODO logs, doc, tests
/**
 * Tries to run a logical plan directly against MongoDB instead of through Spark.
 *
 * @param logicalPlan Plan to execute natively
 * @param config Connector configuration used to reach the collection and build the query
 * @param schemaProvided Schema used to convert the documents into rows; without it nothing is executed
 */
class MongoQueryProcessor(logicalPlan: LogicalPlan, config: Config, schemaProvided: Option[StructType] = None) extends Logging {

  import MongoQueryProcessor._

  /**
   * Executes the plan natively.
   *
   * @return The resulting rows, or None when no schema was provided, the plan is not
   *         natively executable, or an exception was raised while executing it
   */
  def execute(): Option[Array[Row]] = {
    // TODO convert to Spark result using an iterator with batches instead of an array
    schemaProvided.flatMap { schema =>
      try {
        validatedNativePlan.map { case MongoPlan(bs: SimpleLogicalPlan, limit) =>
          if (limit.exists(_ == 0)) {
            // LIMIT 0: nothing to fetch, skip the round trip
            Array.empty[Row]
          } else {
            val (mongoFilters, mongoRequiredColumns) = buildNativeQuery(bs.projects.map(_.name), bs.filters, config/*, bs.collectionRandomAccesses*/)
            val resultSet = MongodbConnection.withCollectionDo(config) { collection =>
              logDebug(s"Executing native query: filters => $mongoFilters projects => $mongoRequiredColumns")
              val cursor = collection.find(mongoFilters, mongoRequiredColumns)
              try {
                cursor.limit(limit.getOrElse(DefaultLimit)).toArray[DBObject]
              } finally {
                // Release the cursor even when fetching the documents fails
                cursor.close()
              }
            }
            sparkResultFromMongodb(bs.projects, bs.collectionRandomAccesses, schema, resultSet)
          }
        }
      } catch {
        case exc: Exception =>
          // Pass the exception itself so that its message and stack trace reach the log
          log.warn(s"Exception executing the native query $logicalPlan", exc); None
      }
    }
  }

  /**
   * Checks whether the logical plan can be executed natively.
   *
   * @return The plan to execute (a MongoPlan) or None when the adapter reports ignored
   *         filters, the base plan is not a SimpleLogicalPlan, or some filter is not supported
   */
  def validatedNativePlan: Option[_] = {// TODO
    // First literal integer limit found in the plan, if any
    lazy val limit: Option[Int] = logicalPlan.collectFirst { case LogicalLimit(Literal(num: Int, _), _) => num }

    def findBasePlan(lplan: LogicalPlan): Option[BaseLogicalPlan] = lplan match {
      case LogicalLimit(_, child) =>
        findBasePlan(child)
      case PhysicalOperation(projectList, filterList, _) =>
        CatalystToCrossdataAdapter.getConnectorLogicalPlan(logicalPlan, projectList, filterList) match {
          case (_, FilterReport(filtersIgnored, _)) if filtersIgnored.nonEmpty => None
          case (basePlan: SimpleLogicalPlan, _) =>
            Some(basePlan)
          // Any other kind of base plan cannot be run natively here; report that instead of
          // throwing NotImplementedError, which execute() would not catch.
          case _ => None
        }
    }

    findBasePlan(logicalPlan).collect{ case bp if checkNativeFilters(bp.filters) => MongoPlan(bp, limit) }
  }

  /** True when every filter, including the operands of And/Or/Not, is one of the supported kinds. */
  private[this] def checkNativeFilters(filters: Seq[SourceFilter]): Boolean = filters.forall {
    case _: sources.EqualTo => true
    case _: sources.In => true
    case _: sources.LessThan => true
    case _: sources.GreaterThan => true
    case _: sources.LessThanOrEqual => true
    case _: sources.GreaterThanOrEqual => true
    case _: sources.IsNull => true
    case _: sources.IsNotNull => true
    case _: sources.StringStartsWith => true
    case _: sources.StringEndsWith => true
    case _: sources.StringContains => true
    case sources.And(left, right) => checkNativeFilters(Array(left, right))
    case sources.Or(left, right) => checkNativeFilters(Array(left, right))
    case sources.Not(filter) => checkNativeFilters(Array(filter))
    // TODO add more filters
    case _ => false
  }

  /**
   * Converts the fetched documents into rows.
   *
   * @param requiredColumns Projected attributes
   * @param indexAccesses Array-item accesses keyed by attribute; the index is read from the
   *                      textual form of the access' right-hand expression
   * @param schema Full schema, pruned here to the required columns
   * @param resultSet Documents returned by MongoDB
   * @return One row per document
   */
  private[this] def sparkResultFromMongodb(
    requiredColumns: Seq[Attribute],
    indexAccesses: Map[Attribute, GetArrayItem],
    schema: StructType,
    resultSet: Array[DBObject]
  ): Array[Row] = {
    asRow(
      pruneSchema(
        schema,
        requiredColumns.map(r => r.name -> indexAccesses.get(r).map(_.right.toString().toInt)).toArray
      ),
      resultSet
    )
  }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy