// com.crealytics.google.analytics.AnalyticsRelation.scala
package com.crealytics.google.analytics
import java.math.BigDecimal
import java.text.{SimpleDateFormat, NumberFormat}
import java.util.{Calendar, Date, Locale}
import com.google.api.services.analytics.Analytics
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types._
import scala.collection.JavaConverters._
import scala.util.{Try, Success}
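/**
 * Spark SQL relation backed by the Google Analytics Core Reporting API.
 * The schema is derived from the GA metadata API plus any user-supplied
 * calculated metrics; scans are translated into GA queries with column
 * pruning and, where the GA filter syntax allows it, filter push-down.
 */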
case class AnalyticsRelation protected[crealytics] (
  analytics: Analytics,
  ids: String,
  startDate: String,
  endDate: String,
  calculatedMetrics: Seq[String],
  queryIndividualDays: Boolean
)(@transient val sqlContext: SQLContext)
  extends BaseRelation with TableScan with PrunedScan with PrunedFilteredScan {
  override val schema: StructType = createSchemaFromColumns(allColumns)

  override def buildScan(): RDD[Row] = buildScan(schema.map(_.name).toArray, Array())

  override def buildScan(requiredColumns: Array[String]): RDD[Row] = buildScan(requiredColumns, Array())

  override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = {
    val results = getResults(ids, startDate, endDate, requiredColumns, filters)
    sqlContext.sparkContext.parallelize(results.map(Row.fromSeq))
  }
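  /** Returns the date n days before beginDate. */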
  private def nDaysAgo(beginDate: Date, n: Integer) = {
    val cal = calendarForDate(beginDate)
    // Go n days backwards; Calendar.add with a negative amount subtracts days.
    cal.add(Calendar.DATE, -n)
    cal.getTime
  }
  val analyticsDateFormat = new SimpleDateFormat("yyyy-MM-dd")
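  /** Parses GA-style relative dates ("today", "yesterday", "NdaysAgo") as well as yyyy-MM-dd. */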
  private def parseGoogleDate(date: String) = {
    if (date == "today") nDaysAgo(new Date(), 0)
    else if (date == "yesterday") nDaysAgo(new Date(), 1)
    else if (date.endsWith("daysAgo")) nDaysAgo(new Date(), date.replace("daysAgo", "").toInt)
    else analyticsDateFormat.parse(date)
  }
  private def calendarForDate(date: Date) = {
    val calendar = Calendar.getInstance
    calendar.setTime(date)
    calendar
  }
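  /** All days between startDate and endDate (inclusive), formatted as yyyy-MM-dd. */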
  private def getDateRange = {
    val end = calendarForDate(parseGoogleDate(endDate))
    Iterator.iterate(calendarForDate(parseGoogleDate(startDate))) { d =>
      d.add(Calendar.DATE, 1)
      d
    }.takeWhile(!_.after(end)).map(dt => analyticsDateFormat.format(dt.getTime))
  }
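  // Columns reported by the GA metadata API; the calculated-metric and
  // templated-metric placeholders are forced to a DECIMAL data type.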
  lazy val defaultColumns = analytics.metadata.columns.list("ga").execute.getItems.asScala.map { c =>
    if (c.getId == "ga:calcMetric_" || c.getId == "ga:metricXX") c.getAttributes.put("dataType", "DECIMAL")
    c
  }

  lazy val calculatedMetricTemplate = defaultColumns.find(_.getId == "ga:calcMetric_").get

  lazy val allColumns = defaultColumns ++ calculatedMetrics.map(
    name => calculatedMetricTemplate.clone.setId("ga:calcMetric_" + name)
  )
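  /**
   * Builds a Spark schema from GA metadata columns. Templated columns
   * (e.g. ga:goalXXStarts) are expanded once per template index, and each
   * field is tagged with an "isDimension" metadata flag for later partitioning.
   */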
  private def createSchemaFromColumns(columns: Seq[com.google.api.services.analytics.model.Column]) =
    columns.foldLeft(new StructType) {
      case (struct, column) =>
        val attributes = column.getAttributes
        val dataType = sparkDataTypeForGoogleDataType(attributes.get("dataType"))
        val templateBounds: Option[(Int, Int)] = for {
          minTemplateIndex <- Option(attributes.get("minTemplateIndex")).map(_.toInt)
          maxTemplateIndex <- Option(attributes.get("maxTemplateIndex")).map(_.toInt)
        } yield (minTemplateIndex, maxTemplateIndex)
        val columnNames = templateBounds.map { case (minIndex, maxIndex) =>
          (minIndex to maxIndex).map(i => column.getId.replaceFirst("XX", s"$i"))
        }.getOrElse(Seq(column.getId))
        val isDimension = attributes.get("type") == "DIMENSION"
        columnNames.foldLeft(struct) { case (s, c) =>
          s.add(
            c.replaceFirst("ga:", ""),
            dataType,
            nullable = true,
            metadata = new MetadataBuilder().putBoolean("isDimension", isDimension).build
          )
        }
    }
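  /** Convenience accessors for splitting a schema into dimension and metric fields. */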
  implicit class RichStructType(st: StructType) {
    def filterByIsDimension(isDimension: Boolean = true): StructType =
      StructType(st.filter(_.metadata.getBoolean("isDimension") == isDimension))

    def dimensionsAndMetrics: (StructType, StructType) = {
      val (dimensions, metrics) = st.partition(_.metadata.getBoolean("isDimension"))
      (StructType(dimensions), StructType(metrics))
    }
  }
  // dimensionsAndMetrics returns (dimensions, metrics), so bind the names in that order.
  val (allDimensions, allMetrics) = schema.dimensionsAndMetrics
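  // Maps GA data types onto Spark SQL type names; PERCENT, CURRENCY and TIME
  // are all numeric and become DECIMAL, everything else maps through unchanged.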
  private def sparkDataTypeForGoogleDataType(dataType: String) = dataType match {
    case "PERCENT" => "DECIMAL"
    case "CURRENCY" => "DECIMAL"
    case "TIME" => "DECIMAL"
    case t => t
  }
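  /** Casts a raw GA cell value (always delivered as a string) to the given Spark SQL type. */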
  private def castTo(datum: String, castType: DataType): Any = {
    castType match {
      case _: ByteType => datum.toByte
      case _: ShortType => datum.toShort
      case _: IntegerType => datum.toInt
      case _: LongType => datum.toLong
      case _: FloatType => Try(datum.toFloat)
        .getOrElse(NumberFormat.getInstance(Locale.getDefault).parse(datum).floatValue())
      case _: DoubleType => Try(datum.toDouble)
        .getOrElse(NumberFormat.getInstance(Locale.getDefault).parse(datum).doubleValue())
      case _: BooleanType => datum.toBoolean
      case _: DecimalType => new BigDecimal(datum.replaceAll(",", ""))
      case _: TimestampType => java.sql.Timestamp.valueOf(datum)
      case _: DateType => java.sql.Date.valueOf(datum)
      case _: StringType => datum
      case _ => throw new RuntimeException(s"Unsupported type: ${castType.typeName}")
    }
  }
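  /**
   * Translates Spark data source filters into the GA filter expression syntax
   * (";" joins clauses with AND, "," with OR). Filters without a GA equivalent
   * are left unimplemented.
   */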
  def combineFilters(filters: Array[Filter]): String = {
    def convertFilter(filter: Filter): String = filter match {
      case EqualTo(attribute, value) => s"ga:$attribute==$value"
      case Not(EqualTo(attribute, value)) => s"ga:$attribute!=$value"
      case EqualNullSafe(attribute, value) => s"ga:$attribute==$value"
      case Not(EqualNullSafe(attribute, value)) => s"ga:$attribute!=$value"
      case GreaterThan(attribute, value) => s"ga:$attribute>$value"
      case GreaterThanOrEqual(attribute, value) => s"ga:$attribute>=$value"
      case LessThan(attribute, value) => s"ga:$attribute<$value"
      case LessThanOrEqual(attribute, value) => s"ga:$attribute<=$value"
      case In(attribute, values) => s"ga:$attribute[]${values.mkString("|")}"
      case And(lhs, rhs) => Seq(lhs, rhs).map(convertFilter).mkString(";")
      case Or(lhs, rhs) => Seq(lhs, rhs).map(convertFilter).mkString(",")
      case StringStartsWith(attribute, value) => s"ga:$attribute=~^$value"
      case StringEndsWith(attribute, value) => s"ga:$attribute=~$value$$"
      case StringContains(attribute, value) => s"ga:$attribute=@$value"
      // Null checks and arbitrary negation cannot be expressed as GA filters.
      case IsNull(attribute) => ???
      case IsNotNull(attribute) => ???
      case Not(_) => ???
    }
    filters.map(convertFilter).mkString(";")
  }
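  /** Runs fn up to n times, returning the first Success or the last Failure. */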
  @annotation.tailrec
  private final def retry[T](n: Int)(fn: => T): Try[T] = {
    Try(fn) match {
      case x: Success[T] => x
      case _ if n > 1 => retry(n - 1)(fn)
      case f => f
    }
  }
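  /**
   * Executes the GA query, paging through results maxPageSize rows at a time,
   * and returns one value sequence per row, in requiredColumns order. With
   * queryIndividualDays enabled, each day in the range is queried separately.
   */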
  private def getResults(ids: String, startDate: String, endDate: String,
                         requiredColumns: Seq[String], filters: Array[Filter]): Seq[Seq[Any]] = {
    val requiredSchema = StructType(requiredColumns.map(c => schema.find(_.name == c).get))
    val (requiredDimensions, rawMetrics) = requiredSchema.dimensionsAndMetrics
    if (queryIndividualDays && !requiredDimensions.map(_.name).contains("date")) {
      throw new IllegalArgumentException("If you use queryIndividualDays, you must select the date dimension.")
    }
    // We need at least 1 metric, otherwise Google complains
    val requiredMetrics = if (rawMetrics.nonEmpty) rawMetrics else Seq[StructField](allMetrics.head)
    val maxPageSize = 10000
    val filtersString = combineFilters(filters)

    def queryDateRange(startDate: String, endDate: String) = {
      val queryWithoutFilter = analytics.data().ga()
        .get(ids, startDate, endDate, requiredMetrics.map("ga:" + _.name).mkString(","))
        .setDimensions(requiredDimensions.map("ga:" + _.name).mkString(","))
        .setMaxResults(maxPageSize)
      val query = if (filters.length > 0) queryWithoutFilter.setFilters(filtersString) else queryWithoutFilter
      val firstResult = query.execute
      val requiredPages = firstResult.getTotalResults / maxPageSize
      val restResults = (1 to requiredPages).flatMap { pageNum =>
        // The analytics API is 1-based, so we need to add 1 in order not to get duplicates
        retry(3)(query.setStartIndex(pageNum * maxPageSize + 1).execute.getRows.asScala).get
      }
      val columnHeaders = firstResult.getColumnHeaders.asScala
      val firstRows = Option(firstResult.getRows).getOrElse(java.util.Collections.emptyList).asScala
      val combinedResult = (firstRows ++ restResults).map(_.asScala)
      val maps = combinedResult.map { line =>
        columnHeaders.zip(line).flatMap { case (header, cell) =>
          val name = header.getName.replaceFirst("ga:", "")
          if (requiredColumns.contains(name)) {
            val dataType = requiredSchema.apply(name).dataType
            Some(name -> castTo(cell, dataType))
          } else None
        }.toMap
      }
      maps.map(m => requiredColumns.map(m(_)))
    }

    if (queryIndividualDays) {
      getDateRange.map(date => queryDateRange(date, date)).reduce(_ union _)
    } else {
      queryDateRange(startDate, endDate)
    }
  }
}
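// A minimal usage sketch (hypothetical, not part of this file): assuming the
// package's DefaultSource registers this relation with the Spark data source
// API, and that the option names mirror the constructor parameters above, a
// DataFrame could be obtained along these lines:
//
//   val df = sqlContext.read
//     .format("com.crealytics.google.analytics")
//     .option("ids", "ga:12345678")          // hypothetical profile id
//     .option("startDate", "7daysAgo")
//     .option("endDate", "yesterday")
//     .load()
//   df.select("date", "sessions").show()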